The patch titled
Subject: mm,hwpoison: fix race with hugetlb page allocation
has been added to the -mm tree. Its filename is
mmhwpoison-fix-race-with-hugetlb-page-allocation.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mmhwpoison-fix-race-with-hugetlb-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mmhwpoison-fix-race-with-hugetlb-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Subject: mm,hwpoison: fix race with hugetlb page allocation
When a hugetlb page fault (under an overcommitting situation) and
memory_failure() race, VM_BUG_ON_PAGE() is triggered by the following
sequence:
CPU0:                               CPU1:

                                    gather_surplus_pages()
                                      page = alloc_surplus_huge_page()
memory_failure_hugetlb()
  get_hwpoison_page(page)
    __get_hwpoison_page(page)
      get_page_unless_zero(page)
                                      zero = put_page_testzero(page)
                                      VM_BUG_ON_PAGE(!zero, page)
                                      enqueue_huge_page(h, page)
  put_page(page)
__get_hwpoison_page() only checks the page refcount before taking an
additional one for memory error handling, which is not enough because
there's a time window where compound pages have non-zero refcount during
hugetlb page initialization.
So make __get_hwpoison_page() check the page status a bit more thoroughly
for hugetlb pages with get_hwpoison_huge_page(). Checking hugetlb-specific
flags under hugetlb_lock makes sure that the hugetlb page is not in a
transient state. Notably, another new function, HWPoisonHandlable(), helps
prevent a race against other transient page states (like a generic
compound page just before PageHuge becomes true).
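Condensed from the hunks below, the refcount grab ends up roughly as
follows (a sketch only; the wrapper name is made up and the THP/tail-page
handling of the real __get_hwpoison_page() is omitted):

/* Sketch, condensed from the hunks below. */
static int get_hwpoison_page_sketch(struct page *page)
{
	struct page *head = compound_head(page);
	bool hugetlb = false;
	int ret;

	/*
	 * Hugetlb pages are judged under hugetlb_lock: only pages that are
	 * fully initialized (HPageFreed or HPageMigratable) may have a
	 * reference taken; a page still being set up by
	 * gather_surplus_pages() is simply not grabbed.
	 */
	ret = get_hwpoison_huge_page(head, &hugetlb);
	if (hugetlb)
		return ret;

	/* Anything else must be of a type hwpoison knows how to handle. */
	if (!HWPoisonHandlable(head))
		return 0;

	return get_page_unless_zero(head);
}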
Link: https://lkml.kernel.org/r/20210603233632.2964832-2-nao.horiguchi@gmail.com
Fixes: ead07f6a867b ("mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reported-by: Muchun Song <songmuchun(a)bytedance.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: <stable(a)vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 6 ++++++
mm/hugetlb.c | 15 +++++++++++++++
mm/memory-failure.c | 29 +++++++++++++++++++++++++++--
3 files changed, 48 insertions(+), 2 deletions(-)
--- a/include/linux/hugetlb.h~mmhwpoison-fix-race-with-hugetlb-page-allocation
+++ a/include/linux/hugetlb.h
@@ -149,6 +149,7 @@ bool hugetlb_reserve_pages(struct inode
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool isolate_huge_page(struct page *page, struct list_head *list);
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
@@ -339,6 +340,11 @@ static inline bool isolate_huge_page(str
return false;
}
+static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ return 0;
+}
+
static inline void putback_active_hugepage(struct page *page)
{
}
--- a/mm/hugetlb.c~mmhwpoison-fix-race-with-hugetlb-page-allocation
+++ a/mm/hugetlb.c
@@ -5857,6 +5857,21 @@ unlock:
return ret;
}
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ int ret = 0;
+
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHeadHuge(page)) {
+ *hugetlb = true;
+ if (HPageFreed(page) || HPageMigratable(page))
+ ret = get_page_unless_zero(page);
+ }
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);
--- a/mm/memory-failure.c~mmhwpoison-fix-race-with-hugetlb-page-allocation
+++ a/mm/memory-failure.c
@@ -949,6 +949,17 @@ static int page_action(struct page_state
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false. This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page)
+{
+ return PageLRU(page) || __PageMovable(page);
+}
+
/**
* __get_hwpoison_page() - Get refcount for memory error handling:
* @page: raw error page (hit by memory error)
@@ -959,8 +970,22 @@ static int page_action(struct page_state
static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * This check prevents from calling get_hwpoison_unless_zero()
+ * for any unsupported type of page in order to reduce the risk of
+ * unexpected races caused by taking a page refcount.
+ */
+ if (!HWPoisonHandlable(head))
+ return 0;
- if (!PageHuge(head) && PageTransHuge(head)) {
+ if (PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -1017,7 +1042,7 @@ try_again:
ret = -EIO;
}
} else {
- if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ if (PageHuge(p) || HWPoisonHandlable(p)) {
ret = 1;
} else {
/*
_
Patches currently in -mm which might be from naoya.horiguchi(a)nec.com are
hugetlb-pass-head-page-to-remove_hugetlb_page.patch
mmhwpoison-fix-race-with-hugetlb-page-allocation.patch
mmhwpoison-send-sigbus-with-error-virutal-address.patch
mmhwpoison-make-get_hwpoison_page-call-get_any_page.patch
From: Andy Lutomirski <luto(a)kernel.org>
If XRSTOR fails due to sufficiently complicated paging errors (e.g.
concurrent TLB invalidation), it may fault with #PF but still modify
portions of the user extended state.
If this happens in __fpu_restore_sig() while a victim task's FPU registers
are still loaded and current is then preempted in favour of the victim
task, the victim task resumes with partially corrupt extended state.
Invalidate the FPU registers when XRSTOR fails to prevent this scenario.
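For reference, __cpu_invalidate_fpregs_state() amounts to forgetting which
task's state the CPU registers hold, so the victim is forced to reload from
its memory image on the next switch-in. Paraphrased from
arch/x86/include/asm/fpu/internal.h (from memory; details may differ):

static inline void __cpu_invalidate_fpregs_state(void)
{
	/* No task owns the register file any more: the next context switch
	 * into the victim cannot take the "registers still valid" fastpath
	 * and must restore from the task's xsave buffer.
	 */
	__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}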
Fixes: 1d731e731c4c ("x86/fpu: Add a fastpath to __fpu__restore_sig()")
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/fpu/signal.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -369,6 +369,27 @@ static int __fpu__restore_sig(void __use
fpregs_unlock();
return 0;
}
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ /*
+ * The FPU registers do not belong to current, and
+ * we just did an FPU restore operation, restricted
+ * to the user portion of the register file, and
+ * failed. In the event that the ucode was
+ * unfriendly and modified the registers at all, we
+ * need to make sure that we aren't corrupting an
+ * innocent non-current task's registers.
+ */
+ __cpu_invalidate_fpregs_state();
+ } else {
+ /*
+ * As above, we may have just clobbered current's
+ * user FPU state. We will either successfully
+ * load it or clear it below, so no action is
+ * required here.
+ */
+ }
+
fpregs_unlock();
} else {
/*
From: Long Li <longli(a)microsoft.com>
After commit 07173c3ec276 ("block: enable multipage bvecs"), a bvec can
span multiple pages. But bio_will_gap() still assumes a single-page bvec
when checking whether two bios may merge. This causes data corruption with
drivers that rely on virt_boundary_mask for correct merging.
Fix this by returning the full multi-page bvec from bio_get_first_bvec().
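For context, the two accessors differ roughly as follows (paraphrasing
include/linux/bvec.h from memory; the values are a hypothetical example
with 4 KiB pages):

/*
 * One multi-page bvec spanning two pages:
 *	{ .bv_page = p, .bv_offset = 0, .bv_len = 8192 }
 *
 * bio_iovec(bio), i.e. bvec_iter_bvec(), clamps the length to the
 * current page:				bv_len == 4096
 * mp_bvec_iter_bvec() describes the full segment:
 *						bv_len == 8192
 *
 * bio_will_gap() compares segment boundaries against the queue's
 * virt_boundary_mask, so seeing only the first page of a multi-page
 * segment can make two bios look mergeable across a boundary the
 * device cannot actually cross.
 */
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);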
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Cc: Pavel Begunkov <asml.silence(a)gmail.com>
Cc: Ming Lei <ming.lei(a)redhat.com>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Jeffle Xu <jefflexu(a)linux.alibaba.com>
Cc: linux-kernel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Fixes: 07173c3ec276 ("block: enable multipage bvecs")
Signed-off-by: Long Li <longli(a)microsoft.com>
---
include/linux/bio.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a0b4cfdf62a4..e89242a53bbc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -271,7 +271,7 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
- *bv = bio_iovec(bio);
+ *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
--
2.17.1
This is a note to let you know that I've just added the patch titled
usb: dwc3: debugfs: Add and remove endpoint dirs dynamically
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 8d396bb0a5b62b326f6be7594d8bd46b088296bd Mon Sep 17 00:00:00 2001
From: Jack Pham <jackp(a)codeaurora.org>
Date: Sat, 29 May 2021 12:29:32 -0700
Subject: usb: dwc3: debugfs: Add and remove endpoint dirs dynamically
The DWC3 DebugFS directory and files are currently created once
during probe. This includes creation of subdirectories for each
of the gadget's endpoints. This works fine for peripheral-only
controllers, as dwc3_core_init_mode() calls dwc3_gadget_init()
just prior to calling dwc3_debugfs_init().
However, for dual-role controllers, dwc3_core_init_mode() will
instead call dwc3_drd_init() which is problematic in a few ways.
First, the initial state must be determined, then dwc3_set_mode()
will have to schedule drd_work and by then dwc3_debugfs_init()
could have already been invoked. Even if the initial mode is
peripheral, dwc3_gadget_init() happens after the DebugFS files
are created, and worse so if the initial state is host and the
controller switches to peripheral much later. And secondly,
even if the gadget endpoints' debug entries were successfully
created, if the controller exits peripheral mode, its dwc3_eps
are freed so the debug files would now hold stale references.
So it is best if the DebugFS endpoint entries are created and
removed dynamically at the same time the underlying dwc3_eps are.
Do this by calling dwc3_debugfs_create_endpoint_dir() as each
endpoint is created, and conversely remove the DebugFS entry when
the endpoint is freed.
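The resulting pairing, condensed from the gadget.c hunks below (a sketch;
the surrounding endpoint setup and loop code is elided):

static int dwc3_gadget_init_endpoint(struct dwc3 *dwc, u8 epnum)
{
	/* ... allocate and initialize dep ... */
	dwc3_debugfs_create_endpoint_dir(dep);	/* dir created with the ep */
	return 0;
}

static void dwc3_gadget_free_endpoints(struct dwc3 *dwc)
{
	/* ... for each dep ... */
	debugfs_remove_recursive(debugfs_lookup(dep->name, dwc->root));
	kfree(dep);			/* no debugfs file outlives dep */
}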
Fixes: 41ce1456e1db ("usb: dwc3: core: make dwc3_set_mode() work properly")
Cc: stable <stable(a)vger.kernel.org>
Reviewed-by: Peter Chen <peter.chen(a)kernel.org>
Signed-off-by: Jack Pham <jackp(a)codeaurora.org>
Link: https://lore.kernel.org/r/20210529192932.22912-1-jackp@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/dwc3/debug.h | 3 +++
drivers/usb/dwc3/debugfs.c | 21 ++-------------------
drivers/usb/dwc3/gadget.c | 3 +++
3 files changed, 8 insertions(+), 19 deletions(-)
diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h
index d0ac89c5b317..d223c54115f4 100644
--- a/drivers/usb/dwc3/debug.h
+++ b/drivers/usb/dwc3/debug.h
@@ -413,9 +413,12 @@ static inline const char *dwc3_gadget_generic_cmd_status_string(int status)
#ifdef CONFIG_DEBUG_FS
+extern void dwc3_debugfs_create_endpoint_dir(struct dwc3_ep *dep);
extern void dwc3_debugfs_init(struct dwc3 *d);
extern void dwc3_debugfs_exit(struct dwc3 *d);
#else
+static inline void dwc3_debugfs_create_endpoint_dir(struct dwc3_ep *dep)
+{ }
static inline void dwc3_debugfs_init(struct dwc3 *d)
{ }
static inline void dwc3_debugfs_exit(struct dwc3 *d)
diff --git a/drivers/usb/dwc3/debugfs.c b/drivers/usb/dwc3/debugfs.c
index 7146ee2ac057..5dbbe53269d3 100644
--- a/drivers/usb/dwc3/debugfs.c
+++ b/drivers/usb/dwc3/debugfs.c
@@ -886,30 +886,14 @@ static void dwc3_debugfs_create_endpoint_files(struct dwc3_ep *dep,
}
}
-static void dwc3_debugfs_create_endpoint_dir(struct dwc3_ep *dep,
- struct dentry *parent)
+void dwc3_debugfs_create_endpoint_dir(struct dwc3_ep *dep)
{
struct dentry *dir;
- dir = debugfs_create_dir(dep->name, parent);
+ dir = debugfs_create_dir(dep->name, dep->dwc->root);
dwc3_debugfs_create_endpoint_files(dep, dir);
}
-static void dwc3_debugfs_create_endpoint_dirs(struct dwc3 *dwc,
- struct dentry *parent)
-{
- int i;
-
- for (i = 0; i < dwc->num_eps; i++) {
- struct dwc3_ep *dep = dwc->eps[i];
-
- if (!dep)
- continue;
-
- dwc3_debugfs_create_endpoint_dir(dep, parent);
- }
-}
-
void dwc3_debugfs_init(struct dwc3 *dwc)
{
struct dentry *root;
@@ -940,7 +924,6 @@ void dwc3_debugfs_init(struct dwc3 *dwc)
&dwc3_testmode_fops);
debugfs_create_file("link_state", 0644, root, dwc,
&dwc3_link_state_fops);
- dwc3_debugfs_create_endpoint_dirs(dwc, root);
}
}
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 88270eee8a48..f14c2aa83759 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -2753,6 +2753,8 @@ static int dwc3_gadget_init_endpoint(struct dwc3 *dwc, u8 epnum)
INIT_LIST_HEAD(&dep->started_list);
INIT_LIST_HEAD(&dep->cancelled_list);
+ dwc3_debugfs_create_endpoint_dir(dep);
+
return 0;
}
@@ -2796,6 +2798,7 @@ static void dwc3_gadget_free_endpoints(struct dwc3 *dwc)
list_del(&dep->endpoint.ep_list);
}
+ debugfs_remove_recursive(debugfs_lookup(dep->name, dwc->root));
kfree(dep);
}
}
--
2.31.1
From: Jeimon <jjjinmeng.zhou(a)gmail.com>
[ Upstream commit 8ab78863e9eff11910e1ac8bcf478060c29b379e ]
The function rawsock_create() calls the privileged function sk_alloc(), which requires a namespace-aware capability check against net->user_ns, i.e. ns_capable(). However, the original code checks against init_user_ns using capable(). So replace capable() with ns_capable().
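For illustration, the distinction in the socket-create path looks like this
(a sketch with explanatory comments, not the patch itself):

if (sock->type == SOCK_RAW) {
	/*
	 * capable(CAP_NET_RAW) passes only for a task privileged in the
	 * initial user namespace.  ns_capable(net->user_ns, CAP_NET_RAW)
	 * also passes for a task privileged in the user namespace that
	 * owns this network namespace, which is the right scope when
	 * creating a socket inside "net".
	 */
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	sock->ops = &rawsock_raw_ops;
}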
Signed-off-by: Jeimon <jjjinmeng.zhou(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/nfc/rawsock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 92a3cfae4de8..2fba626a0125 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -345,7 +345,7 @@ static int rawsock_create(struct net *net, struct socket *sock,
return -ESOCKTNOSUPPORT;
if (sock->type == SOCK_RAW) {
- if (!capable(CAP_NET_RAW))
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
sock->ops = &rawsock_raw_ops;
} else {
--
2.30.2
From: Saravana Kannan <saravanak(a)google.com>
[ Upstream commit c7299fea67696db5bd09d924d1f1080d894f92ef ]
When an SPI device is unregistered, the spi->controller->cleanup() is
called in the device's release callback. That's wrong for a few
reasons:
1. spi_dev_put() can be called before spi_add_device() is called. And
it's spi_add_device() that calls spi_setup(). This will cause cleanup()
to get called without the SPI device ever having been set up.
2. There's no guarantee that the controller's driver would be present by
the time the spi device's release function gets called.
3. It also causes a "sleeping in atomic context" stack dump [1] when the
device link deletion code does a put_device() on the SPI device.
Fix these issues by simply moving the cleanup from the device release
callback to the actual spi_unregister_device() function.
[1] - https://lore.kernel.org/lkml/CAHp75Vc=FCGcUyS0v6fnxme2YJ+qD+Y-hQDQLa2JhWNON…
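The resulting lifecycle, condensed from the hunks below (a sketch; error
handling, locking and the of_node/ACPI teardown are elided):

/* spi_add_device(): setup happens here, so a failed device_add() is
 * undone immediately, while the controller driver is known to be bound.
 */
status = spi_setup(spi);
/* ... */
status = device_add(&spi->dev);
if (status < 0)
	spi_cleanup(spi);

/* spi_unregister_device(): cleanup runs up front, in process context,
 * before the device is torn down ...
 */
spi_cleanup(spi);
device_unregister(&spi->dev);

/* ... so spidev_release(), which can run much later (and, via device
 * link deletion, from atomic context), no longer calls back into the
 * controller at all.
 */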
Signed-off-by: Saravana Kannan <saravanak(a)google.com>
Link: https://lore.kernel.org/r/20210426235638.1285530-1-saravanak@google.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index da71a53b0df7..49f39c54d679 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -52,10 +52,6 @@ static void spidev_release(struct device *dev)
{
struct spi_device *spi = to_spi_device(dev);
- /* spi controllers may cleanup for released devices */
- if (spi->controller->cleanup)
- spi->controller->cleanup(spi);
-
spi_controller_put(spi->controller);
kfree(spi);
}
@@ -501,6 +497,12 @@ static int spi_dev_check(struct device *dev, void *data)
return 0;
}
+static void spi_cleanup(struct spi_device *spi)
+{
+ if (spi->controller->cleanup)
+ spi->controller->cleanup(spi);
+}
+
/**
* spi_add_device - Add spi_device allocated with spi_alloc_device
* @spi: spi_device to register
@@ -562,11 +564,13 @@ int spi_add_device(struct spi_device *spi)
/* Device may be bound to an active driver when this returns */
status = device_add(&spi->dev);
- if (status < 0)
+ if (status < 0) {
dev_err(dev, "can't add %s, status %d\n",
dev_name(&spi->dev), status);
- else
+ spi_cleanup(spi);
+ } else {
dev_dbg(dev, "registered child %s\n", dev_name(&spi->dev));
+ }
done:
mutex_unlock(&spi_add_lock);
@@ -653,6 +657,8 @@ void spi_unregister_device(struct spi_device *spi)
if (!spi)
return;
+ spi_cleanup(spi);
+
if (spi->dev.of_node) {
of_node_clear_flag(spi->dev.of_node, OF_POPULATED);
of_node_put(spi->dev.of_node);
--
2.30.2
From: Saravana Kannan <saravanak(a)google.com>
[ Upstream commit c7299fea67696db5bd09d924d1f1080d894f92ef ]
When an SPI device is unregistered, the spi->controller->cleanup() is
called in the device's release callback. That's wrong for a few
reasons:
1. spi_dev_put() can be called before spi_add_device() is called. And
it's spi_add_device() that calls spi_setup(). This will cause cleanup()
to get called without the SPI device ever having been set up.
2. There's no guarantee that the controller's driver would be present by
the time the spi device's release function gets called.
3. It also causes a "sleeping in atomic context" stack dump [1] when the
device link deletion code does a put_device() on the SPI device.
Fix these issues by simply moving the cleanup from the device release
callback to the actual spi_unregister_device() function.
[1] - https://lore.kernel.org/lkml/CAHp75Vc=FCGcUyS0v6fnxme2YJ+qD+Y-hQDQLa2JhWNON…
Signed-off-by: Saravana Kannan <saravanak(a)google.com>
Link: https://lore.kernel.org/r/20210426235638.1285530-1-saravanak@google.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index bbe33016d371..39dfaad9097e 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -55,10 +55,6 @@ static void spidev_release(struct device *dev)
{
struct spi_device *spi = to_spi_device(dev);
- /* spi controllers may cleanup for released devices */
- if (spi->controller->cleanup)
- spi->controller->cleanup(spi);
-
spi_controller_put(spi->controller);
kfree(spi);
}
@@ -506,6 +502,12 @@ static int spi_dev_check(struct device *dev, void *data)
return 0;
}
+static void spi_cleanup(struct spi_device *spi)
+{
+ if (spi->controller->cleanup)
+ spi->controller->cleanup(spi);
+}
+
/**
* spi_add_device - Add spi_device allocated with spi_alloc_device
* @spi: spi_device to register
@@ -567,11 +569,13 @@ int spi_add_device(struct spi_device *spi)
/* Device may be bound to an active driver when this returns */
status = device_add(&spi->dev);
- if (status < 0)
+ if (status < 0) {
dev_err(dev, "can't add %s, status %d\n",
dev_name(&spi->dev), status);
- else
+ spi_cleanup(spi);
+ } else {
dev_dbg(dev, "registered child %s\n", dev_name(&spi->dev));
+ }
done:
mutex_unlock(&spi_add_lock);
@@ -658,6 +662,8 @@ void spi_unregister_device(struct spi_device *spi)
if (!spi)
return;
+ spi_cleanup(spi);
+
if (spi->dev.of_node) {
of_node_clear_flag(spi->dev.of_node, OF_POPULATED);
of_node_put(spi->dev.of_node);
--
2.30.2
This reverts commit 9e31c1fe45d555a948ff66f1f0e3fe1f83ca63f7. Ever
since that commit, we've been having issues where a hang in one client
can propagate to another. In particular, a hang in an app can propagate
to the X server which causes the whole desktop to lock up.
Error propagation along fences sounds like a good idea, but as this bug
shows, it has surprising consequences, since propagating errors across
security boundaries is not a good thing.
What we do have is tracking of hangs on the ctx, with the information
reported to userspace using RESET_STATS. That's how arb_robustness works.
Also, if my understanding is still correct, the EIO from execbuf is
returned when your context is banned (because it is not recoverable or has
had too many hangs). And in all these cases it's up to userspace to figure
out what is impacted and should be reported to the application; that's not
on the kernel to guess and automatically propagate.
What's more, we're also building more features on top of ctx error
reporting with the RESET_STATS ioctl: encrypted buffers use the same
mechanism, and the userspace fence wait also relies on it. So it is the
path going forward for reporting gpu hangs and resets to userspace.
So all together that's why I think we should just bury this idea again as
not quite the direction we want to go to, hence why I think the revert is
the right option here.
For backporters: Please note that you _must_ have a backport of
https://lore.kernel.org/dri-devel/20210602164149.391653-2-jason@jlekstrand.…
because otherwise backporting just this patch opens up a security bug.
v2: Augment commit message. Also restore Jason's sob that I
accidentally lost.
v3: Add a note for backporters
Signed-off-by: Jason Ekstrand <jason(a)jlekstrand.net>
Reported-by: Marcin Slusarz <marcin.slusarz(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.6+
Cc: Jason Ekstrand <jason.ekstrand(a)intel.com>
Cc: Marcin Slusarz <marcin.slusarz(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3080
Fixes: 9e31c1fe45d5 ("drm/i915: Propagate errors on awaiting already signaled fences")
Acked-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Reviewed-by: Jon Bloomfield <jon.bloomfield(a)intel.com>
---
drivers/gpu/drm/i915/i915_request.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 970d8f4986bbe..b796197c07722 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1426,10 +1426,8 @@ i915_request_await_execution(struct i915_request *rq,
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
if (fence->context == rq->fence.context)
continue;
@@ -1527,10 +1525,8 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
/*
* Requests on the same timeline are explicitly ordered, along
--
2.31.1
This reverts commit 9e31c1fe45d555a948ff66f1f0e3fe1f83ca63f7. Ever
since that commit, we've been having issues where a hang in one client
can propagate to another. In particular, a hang in an app can propagate
to the X server which causes the whole desktop to lock up.
Signed-off-by: Jason Ekstrand <jason.ekstrand(a)intel.com>
Reported-by: Marcin Slusarz <marcin.slusarz(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.6+
Cc: Jason Ekstrand <jason.ekstrand(a)intel.com>
Cc: Marcin Slusarz <marcin.slusarz(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3080
Fixes: 9e31c1fe45d5 ("drm/i915: Propagate errors on awaiting already signaled fences")
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Reviewed-by: Jon Bloomfield <jon.bloomfield(a)intel.com>
---
drivers/gpu/drm/i915/i915_request.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 970d8f4986bbe..b796197c07722 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1426,10 +1426,8 @@ i915_request_await_execution(struct i915_request *rq,
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
if (fence->context == rq->fence.context)
continue;
@@ -1527,10 +1525,8 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
/*
* Requests on the same timeline are explicitly ordered, along
--
2.31.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 74b2fc882d380d8fafc2a26f01d401c2a7beeadb
Gitweb: https://git.kernel.org/tip/74b2fc882d380d8fafc2a26f01d401c2a7beeadb
Author: Borislav Petkov <bp(a)suse.de>
AuthorDate: Wed, 02 Jun 2021 12:07:52 +02:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Thu, 03 Jun 2021 16:32:59 +02:00
dmaengine: idxd: Use cpu_feature_enabled()
When testing x86 feature bits, use cpu_feature_enabled() so that
build-disabled features can remain off, regardless of what CPUID says.
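The practical difference (paraphrased from the x86 cpufeature headers, from
memory):

/*
 * boot_cpu_has(X86_FEATURE_FOO):
 *	runtime test of the CPUID-derived capability bits only.
 *
 * cpu_feature_enabled(X86_FEATURE_FOO):
 *	additionally folds in the compile-time DISABLED_MASK from
 *	<asm/disabled-features.h>, so a feature the build has opted out
 *	of (as ENQCMD is force-disabled by a patch further below) reads
 *	as off even when CPUID advertises it, and the compiler can drop
 *	the dependent code entirely.
 */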
Fixes: 8e50d392652f ("dmaengine: idxd: Add shared workqueue support")
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-By: Vinod Koul <vkoul(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
---
drivers/dma/idxd/init.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 2a926be..776fd44 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -745,12 +745,12 @@ static int __init idxd_init_module(void)
* If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
* enumerating the device. We can not utilize it.
*/
- if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) {
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
pr_warn("idxd driver failed to load without MOVDIR64B.\n");
return -ENODEV;
}
- if (!boot_cpu_has(X86_FEATURE_ENQCMD))
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
pr_warn("Platform does not have ENQCMD(S) support.\n");
else
support_enqcmd = true;
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 9bfecd05833918526cc7357d55e393393440c5fa
Gitweb: https://git.kernel.org/tip/9bfecd05833918526cc7357d55e393393440c5fa
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Sat, 29 May 2021 11:17:30 +02:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Thu, 03 Jun 2021 16:33:09 +02:00
x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid()
While digesting the XSAVE-related horrors which got introduced with
the supervisor/user split, the recent addition of ENQCMD-related
functionality got on the radar and turned out to be similarly broken.
update_pasid(), which is only required when X86_FEATURE_ENQCMD is
available, is invoked from two places:
1) From switch_to() for the incoming task
2) Via a SMP function call from the IOMMU/SMV code
#1 is halfway correct as it hacks around the brokenness of get_xsave_addr()
by enforcing the state to be 'present', but all the conditionals in that
code are completely pointless for that.
Also the invocation is just useless overhead because at that point
it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task
and all of this can be handled at return to user space.
#2 is broken beyond repair. The comment in the code claims that it is safe
to invoke this in an IPI, but that's just wishful thinking.
FPU state of a running task is protected by fregs_lock() which is
nothing else than a local_bh_disable(). As BH-disabled regions run
usually with interrupts enabled the IPI can hit a code section which
modifies FPU state and there is absolutely no guarantee that any of the
assumptions which are made for the IPI case is true.
Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is
invoked with a NULL pointer argument, so it can hit a completely
unrelated task and unconditionally force an update for nothing.
Worse, it can hit a kernel thread which operates on a user space
address space and set a random PASID for it.
The offending commit does not cleanly revert, but it's sufficient to
force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid()
code to make this dysfunctional all over the place. Anything more
complex would require more surgery and none of the related functions
outside of the x86 core code are blatantly wrong, so removing those
would be overkill.
As nothing enables the PASID bit in the IA32_XSS MSR yet, which is
required to make this actually work, this cannot result in a regression
except for related out of tree train-wrecks, but they are broken already
today.
Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Acked-by: Andy Lutomirski <luto(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de
---
arch/x86/include/asm/disabled-features.h | 7 +---
arch/x86/include/asm/fpu/api.h | 6 +--
arch/x86/include/asm/fpu/internal.h | 7 +---
arch/x86/kernel/fpu/xstate.c | 57 +-----------------------
4 files changed, 3 insertions(+), 74 deletions(-)
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index b7dd944..8f28faf 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -56,11 +56,8 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_IOMMU_SUPPORT
-# define DISABLE_ENQCMD 0
-#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
-#endif
+/* Force disable because it's broken beyond repair */
+#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#ifdef CONFIG_X86_SGX
# define DISABLE_SGX 0
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index ed33a14..23bef08 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
*/
#define PASID_DISABLED 0
-#ifdef CONFIG_IOMMU_SUPPORT
-/* Update current's PASID MSR/state by mm's PASID. */
-void update_pasid(void);
-#else
static inline void update_pasid(void) { }
-#endif
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 8d33ad8..ceeba9f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
pkru_val = pk->pkru;
}
__write_pkru(pkru_val);
-
- /*
- * Expensive PASID MSR write will be avoided in update_pasid() because
- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
- * unless it's different from mm->pasid to reduce overhead.
- */
- update_pasid();
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a85c640..d0eef96 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-
-#ifdef CONFIG_IOMMU_SUPPORT
-void update_pasid(void)
-{
- u64 pasid_state;
- u32 pasid;
-
- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
- return;
-
- if (!current->mm)
- return;
-
- pasid = READ_ONCE(current->mm->pasid);
- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
- pasid_state = pasid == PASID_DISABLED ?
- pasid : pasid | MSR_IA32_PASID_VALID;
-
- /*
- * No need to hold fregs_lock() since the task's fpstate won't
- * be changed by others (e.g. ptrace) while the task is being
- * switched to or is in IPI.
- */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- /* The MSR is active and can be directly updated. */
- wrmsrl(MSR_IA32_PASID, pasid_state);
- } else {
- struct fpu *fpu = &current->thread.fpu;
- struct ia32_pasid_state *ppasid_state;
- struct xregs_state *xsave;
-
- /*
- * The CPU's xstate registers are not currently active. Just
- * update the PASID state in the memory buffer here. The
- * PASID MSR will be loaded when returning to user mode.
- */
- xsave = &fpu->state.xsave;
- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
- /*
- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
- * won't be NULL and no need to check its value.
- *
- * Only update the task's PASID state when it's different
- * from the mm's pasid.
- */
- if (ppasid_state->pasid != pasid_state) {
- /*
- * Invalid fpregs so that state restoring will pick up
- * the PASID state.
- */
- __fpu_invalidate_fpregs_state(fpu);
- ppasid_state->pasid = pasid_state;
- }
- }
-}
-#endif /* CONFIG_IOMMU_SUPPORT */
On Thu, Jun 3, 2021 at 7:53 AM Hangbin Liu <liuhangbin(a)gmail.com> wrote:
>
> In curve25519_mod_init() the curve25519_alg will be registered only when
> (X86_FEATURE_BMI2 && X86_FEATURE_ADX). But in curve25519_mod_exit()
> it still checks (X86_FEATURE_BMI2 || X86_FEATURE_ADX) when do crypto
> unregister. This will trigger a BUG_ON in crypto_unregister_alg() as
> alg->cra_refcnt is 0 if the cpu only supports one of X86_FEATURE_BMI2
> and X86_FEATURE_ADX.
>
> Fixes: 07b586fe0662 ("crypto: x86/curve25519 - replace with formally verified implementation")
> Signed-off-by: Hangbin Liu <liuhangbin(a)gmail.com>
> ---
> arch/x86/crypto/curve25519-x86_64.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 6706b6cb1d0f..38caf61cd5b7 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -1500,7 +1500,7 @@ static int __init curve25519_mod_init(void)
> static void __exit curve25519_mod_exit(void)
> {
> if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
> - (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
> + static_branch_likely(&curve25519_use_bmi2_adx))
> crypto_unregister_kpp(&curve25519_alg);
> }
Looks like the error is actually that the `||` should be a `&&`. But
if you'd like to branch on that static key instead, that's fine.
Cc: stable(a)vger.kernel.org
Reviewed-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
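For reference, the init side (quoted from memory; details may differ)
enables the static key only when *both* features are present, and skips
registration otherwise, which is why keying the unregister on the same
static branch (or on &&) is the matching test:

static int __init curve25519_mod_init(void)
{
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		static_branch_enable(&curve25519_use_bmi2_adx);
	else
		return 0;

	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
		crypto_register_kpp(&curve25519_alg) : 0;
}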
From: Joerg Roedel <jroedel(a)suse.de>
For now, kexec is not supported when running as an SEV-ES guest. Doing
so requires additional hypervisor support and special code to hand
over the CPUs to the new kernel in a safe way.
Until this is implemented, do not support kexec in SEV-ES guests.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
arch/x86/kernel/machine_kexec_64.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c078b0d3ab0e..f902cc9cc634 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -620,3 +620,11 @@ void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
+
+/*
+ * Kexec is not supported in SEV-ES guests yet
+ */
+bool arch_kexec_supported(void)
+{
+ return !sev_es_active();
+}
--
2.31.1
From: Joerg Roedel <jroedel(a)suse.de>
Allow a runtime opt-out of kexec support for architecture code in case
the kernel is running in an environment where kexec is not properly
supported yet.
This will be used on x86 when the kernel is running as an SEV-ES
guest. SEV-ES guests need special handling for kexec to hand over all
CPUs to the new kernel. This requires special hypervisor support and
handling code in the guest which is not yet implemented.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
include/linux/kexec.h | 1 +
kernel/kexec.c | 14 ++++++++++++++
kernel/kexec_file.c | 9 +++++++++
3 files changed, 24 insertions(+)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729..85c30dcd0bdc 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -201,6 +201,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len);
#endif
int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
+bool arch_kexec_supported(void);
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..d03134160458 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -195,11 +195,25 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
* that to happen you need to do that yourself.
*/
+bool __weak arch_kexec_supported(void)
+{
+ return true;
+}
+
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
int result;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..96d08a512e9c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -358,6 +358,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
int ret = 0, i;
struct kimage **dest_image, *image;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
--
2.31.1
xfrm_policy_lookup_bytype loops on seqcount mutex xfrm_policy_hash_generation
within an RCU read side critical section. Although ill advised, this is fine if
the loop is bounded.
xfrm_policy_hash_generation wraps mutex hash_resize_mutex, which is used to
serialize writers (xfrm_hash_resize, xfrm_hash_rebuild). This is fine too.
On PREEMPT_RT=y, the read_seqcount_begin call within xfrm_policy_lookup_bytype
emits a mutex lock/unlock for hash_resize_mutex. Mutex locking is fine, since
RCU read side critical sections are allowed to sleep with PREEMPT_RT.
xfrm_hash_resize can, however, block on synchronize_rcu while holding
hash_resize_mutex.
This leads to the following situation on PREEMPT_RT, where the writer is
blocked on RCU grace period expiry, while the reader is blocked on a lock held
by the writer:
Thread 1 (xfrm_hash_resize)       Thread 2 (xfrm_policy_lookup_bytype)

                                  rcu_read_lock();
mutex_lock(&hash_resize_mutex);
                                  read_seqcount_begin(&xfrm_policy_hash_generation);
                                  mutex_lock(&hash_resize_mutex); // block
xfrm_bydst_resize();
synchronize_rcu(); // block
                                  <RCU stalls in xfrm_policy_lookup_bytype>
Move the read_seqcount_begin call outside of the RCU read-side critical
section, and do an rcu_read_unlock/retry if we got stale data within the
critical section.
On non-PREEMPT_RT, this shortens the time spent within the RCU read-side
critical section in case the seqcount needs a retry, and avoids unbounded
looping.
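The fixed lookup skeleton, condensed from the hunks below (a sketch; the
inexact-policy walk and the fail path are elided):

retry:
	sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
	rcu_read_lock();

	chain = policy_hash_direct(net, daddr, saddr, family, dir);
	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) {
		/* Stale hash tables: leave the RCU read side first, so a
		 * writer blocked in synchronize_rcu() can make progress,
		 * then start over.
		 */
		rcu_read_unlock();
		goto retry;
	}
	/* ... walk "chain" under RCU, re-checking the seqcount (and the
	 * refcount grab) before returning, unlocking and retrying on
	 * failure ... */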
Fixes: a7c44247f70 ("xfrm: policy: make xfrm_policy_lookup_bytype lockless")
Signed-off-by: Varad Gautam <varad.gautam(a)suse.com>
Cc: linux-rt-users <linux-rt-users(a)vger.kernel.org>
Cc: netdev(a)vger.kernel.org
Cc: stable(a)vger.kernel.org # v4.9
Cc: Steffen Klassert <steffen.klassert(a)secunet.com>
Cc: Herbert Xu <herbert(a)gondor.apana.org.au>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Florian Westphal <fw(a)strlen.de>
---
net/xfrm/xfrm_policy.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ce500f847b99..e9d0df2a2ab1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2092,12 +2092,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
if (unlikely(!daddr || !saddr))
return NULL;
- rcu_read_lock();
retry:
- do {
- sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
- chain = policy_hash_direct(net, daddr, saddr, family, dir);
- } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+ sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ rcu_read_lock();
+
+ chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) {
+ rcu_read_unlock();
+ goto retry;
+ }
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -2128,11 +2131,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
skip_inexact:
- if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+ if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) {
+ rcu_read_unlock();
goto retry;
+ }
- if (ret && !xfrm_pol_hold_rcu(ret))
+ if (ret && !xfrm_pol_hold_rcu(ret)) {
+ rcu_read_unlock();
goto retry;
+ }
fail:
rcu_read_unlock();
--
2.26.2
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: db099bafbf5e9c421a3a11cf33965c7b0afb3f89
Gitweb: https://git.kernel.org/tip/db099bafbf5e9c421a3a11cf33965c7b0afb3f89
Author: Borislav Petkov <bp(a)suse.de>
AuthorDate: Wed, 02 Jun 2021 12:07:52 +02:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Wed, 02 Jun 2021 12:29:00 +02:00
dmaengine: idxd: Use cpu_feature_enabled()
When testing x86 feature bits, use cpu_feature_enabled() so that
build-disabled features can remain off, regardless of what CPUID says.
Fixes: 8e50d392652f ("dmaengine: idxd: Add shared workqueue support")
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
---
drivers/dma/idxd/init.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 2a926be..776fd44 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -745,12 +745,12 @@ static int __init idxd_init_module(void)
* If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
* enumerating the device. We can not utilize it.
*/
- if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) {
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
pr_warn("idxd driver failed to load without MOVDIR64B.\n");
return -ENODEV;
}
- if (!boot_cpu_has(X86_FEATURE_ENQCMD))
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
pr_warn("Platform does not have ENQCMD(S) support.\n");
else
support_enqcmd = true;
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 6867ee8bcb754c59a8304f4a9d693ed6cfb96b31
Gitweb: https://git.kernel.org/tip/6867ee8bcb754c59a8304f4a9d693ed6cfb96b31
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Sat, 29 May 2021 11:17:30 +02:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Wed, 02 Jun 2021 12:29:00 +02:00
x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid()
While digesting the XSAVE-related horrors which got introduced with
the supervisor/user split, the recent addition of ENQCMD-related
functionality got on the radar and turned out to be similarly broken.
update_pasid(), which is only required when X86_FEATURE_ENQCMD is
available, is invoked from two places:
1) From switch_to() for the incoming task
2) Via a SMP function call from the IOMMU/SMV code
#1 is halfway correct as it hacks around the brokenness of get_xsave_addr()
by enforcing the state to be 'present', but all the conditionals in that
code are completely pointless for that.
Also the invocation is just useless overhead because at that point
it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task
and all of this can be handled at return to user space.
#2 is broken beyond repair. The comment in the code claims that it is safe
to invoke this in an IPI, but that's just wishful thinking.
FPU state of a running task is protected by fregs_lock() which is
nothing else than a local_bh_disable(). As BH-disabled regions run
usually with interrupts enabled the IPI can hit a code section which
modifies FPU state and there is absolutely no guarantee that any of the
assumptions which are made for the IPI case is true.
Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is
invoked with a NULL pointer argument, so it can hit a completely
unrelated task and unconditionally force an update for nothing.
Worse, it can hit a kernel thread which operates on a user space
address space and set a random PASID for it.
The offending commit does not cleanly revert, but it's sufficient to
force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid()
code to make this dysfunctional all over the place. Anything more
complex would require more surgery and none of the related functions
outside of the x86 core code are blatantly wrong, so removing those
would be overkill.
As nothing enables the PASID bit in the IA32_XSS MSR yet, which is
required to make this actually work, this cannot result in a regression
except for related out of tree train-wrecks, but they are broken already
today.
Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Acked-by: Andy Lutomirski <luto(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de
---
arch/x86/include/asm/disabled-features.h | 7 +---
arch/x86/include/asm/fpu/api.h | 6 +--
arch/x86/include/asm/fpu/internal.h | 7 +---
arch/x86/kernel/fpu/xstate.c | 57 +-----------------------
4 files changed, 3 insertions(+), 74 deletions(-)
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index b7dd944..8f28faf 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -56,11 +56,8 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_IOMMU_SUPPORT
-# define DISABLE_ENQCMD 0
-#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
-#endif
+/* Force disable because it's broken beyond repair */
+#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#ifdef CONFIG_X86_SGX
# define DISABLE_SGX 0
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index ed33a14..23bef08 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
*/
#define PASID_DISABLED 0
-#ifdef CONFIG_IOMMU_SUPPORT
-/* Update current's PASID MSR/state by mm's PASID. */
-void update_pasid(void);
-#else
static inline void update_pasid(void) { }
-#endif
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 8d33ad8..ceeba9f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
pkru_val = pk->pkru;
}
__write_pkru(pkru_val);
-
- /*
- * Expensive PASID MSR write will be avoided in update_pasid() because
- * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
- * unless it's different from mm->pasid to reduce overhead.
- */
- update_pasid();
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index a85c640..d0eef96 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-
-#ifdef CONFIG_IOMMU_SUPPORT
-void update_pasid(void)
-{
- u64 pasid_state;
- u32 pasid;
-
- if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
- return;
-
- if (!current->mm)
- return;
-
- pasid = READ_ONCE(current->mm->pasid);
- /* Set the valid bit in the PASID MSR/state only for valid pasid. */
- pasid_state = pasid == PASID_DISABLED ?
- pasid : pasid | MSR_IA32_PASID_VALID;
-
- /*
- * No need to hold fregs_lock() since the task's fpstate won't
- * be changed by others (e.g. ptrace) while the task is being
- * switched to or is in IPI.
- */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- /* The MSR is active and can be directly updated. */
- wrmsrl(MSR_IA32_PASID, pasid_state);
- } else {
- struct fpu *fpu = &current->thread.fpu;
- struct ia32_pasid_state *ppasid_state;
- struct xregs_state *xsave;
-
- /*
- * The CPU's xstate registers are not currently active. Just
- * update the PASID state in the memory buffer here. The
- * PASID MSR will be loaded when returning to user mode.
- */
- xsave = &fpu->state.xsave;
- xsave->header.xfeatures |= XFEATURE_MASK_PASID;
- ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
- /*
- * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
- * won't be NULL and no need to check its value.
- *
- * Only update the task's PASID state when it's different
- * from the mm's pasid.
- */
- if (ppasid_state->pasid != pasid_state) {
- /*
- * Invalid fpregs so that state restoring will pick up
- * the PASID state.
- */
- __fpu_invalidate_fpregs_state(fpu);
- ppasid_state->pasid = pasid_state;
- }
- }
-}
-#endif /* CONFIG_IOMMU_SUPPORT */
From: Thomas Gleixner <tglx(a)linutronix.de>
The non-compacted slowpath uses __copy_from_user() and copies the entire
user buffer into the kernel buffer, verbatim. This means that the kernel
buffer may now contain entirely invalid state on which XRSTOR will #GP.
validate_user_xstate_header() can detect some of that corruption, but that
leaves the onus on callers to clear the buffer.
Prior to XSAVES support it was possible to just reinitialize the buffer
completely, but with supervisor states that is no longer possible, as the
buffer-clearing code split got it backwards. Fixing that is possible, but
not corrupting the state in the first place is more robust.
Avoid corruption of the kernel XSAVE buffer by using copy_user_to_xstate()
which validates the XSAVE header contents before copying the actual states
to the kernel. copy_user_to_xstate() was previously only called for
compacted-format kernel buffers, but it works for both compacted and
non-compacted forms.
Using it for the non-compacted form is slower because of multiple
__copy_from_user() operations, but that cost is less important than robust
code in an already slow path.
[ Changelog polished by Dave Hansen ]
Fixes: b860eb8dce59 ("x86/fpu/xstate: Define new functions for clearing fpregs and xstates")
Reported-by: syzbot+2067e764dbcd10721e2e(a)syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
---
arch/x86/include/asm/fpu/xstate.h | 4 ----
arch/x86/kernel/fpu/signal.c | 9 +--------
arch/x86/kernel/fpu/xstate.c | 2 +-
3 files changed, 2 insertions(+), 13 deletions(-)
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -112,8 +112,4 @@ void copy_supervisor_to_kernel(struct xr
void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
-
-/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr);
-
#endif
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -405,14 +405,7 @@ static int __fpu__restore_sig(void __use
if (use_xsave() && !fx_only) {
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
- if (using_compacted_format()) {
- ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
- } else {
- ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
-
- if (!ret && state_size > offsetof(struct xregs_state, header))
- ret = validate_user_xstate_header(&fpu->state.xsave.header);
- }
+ ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
if (ret)
goto err_out;
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -515,7 +515,7 @@ int using_compacted_format(void)
}
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr)
+static int validate_user_xstate_header(const struct xstate_header *hdr)
{
/* No unknown or supervisor features may be set */
if (hdr->xfeatures & ~xfeatures_mask_user())
This is a note to let you know that I've just added the patch titled
usb: pd: Set PD_T_SINK_WAIT_CAP to 310ms
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 6490fa565534fa83593278267785a694fd378a2b Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso(a)google.com>
Date: Fri, 28 May 2021 16:16:13 +0800
Subject: usb: pd: Set PD_T_SINK_WAIT_CAP to 310ms
The timer PD_T_SINK_WAIT_CAP is currently set to 240ms, which violates the
SinkWaitCapTimer (tTypeCSinkWaitCap, 310 - 620 ms) defined in the PD
Spec if the port runs the state machine fast enough. Set it to the lower
bound of 310ms to ensure the timeout is within Spec.
Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)")
Cc: stable <stable(a)vger.kernel.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Kyle Tso <kyletso(a)google.com>
Link: https://lore.kernel.org/r/20210528081613.730661-1-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/linux/usb/pd.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index bf00259493e0..96b7ff66f074 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -460,7 +460,7 @@ static inline unsigned int rdo_max_power(u32 rdo)
#define PD_T_RECEIVER_RESPONSE 15 /* 15ms max */
#define PD_T_SOURCE_ACTIVITY 45
#define PD_T_SINK_ACTIVITY 135
-#define PD_T_SINK_WAIT_CAP 240
+#define PD_T_SINK_WAIT_CAP 310 /* 310 - 620 ms */
#define PD_T_PS_TRANSITION 500
#define PD_T_SRC_TRANSITION 35
#define PD_T_DRP_SNK 40
--
2.31.1
This is a note to let you know that I've just added the patch titled
usb: dwc3: gadget: Bail from dwc3_gadget_exit() if dwc->gadget is
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 03715ea2e3dbbc56947137ce3b4ac18a726b2f87 Mon Sep 17 00:00:00 2001
From: Jack Pham <jackp(a)codeaurora.org>
Date: Fri, 28 May 2021 09:04:05 -0700
Subject: usb: dwc3: gadget: Bail from dwc3_gadget_exit() if dwc->gadget is
NULL
There exists a possible scenario in which dwc3_gadget_init() can fail:
during a host -> peripheral mode switch in dwc3_set_mode(), when
a pending gadget driver fails to bind. Then, if the DRD undergoes
another mode switch from peripheral->host the resulting
dwc3_gadget_exit() will attempt to reference an invalid and dangling
dwc->gadget pointer as well as call dma_free_coherent() on unmapped
DMA pointers.
The exact scenario can be reproduced as follows:
- Start DWC3 in peripheral mode
- Configure ConfigFS gadget with FunctionFS instance (or use g_ffs)
- Run FunctionFS userspace application (open EPs, write descriptors, etc)
- Bind gadget driver to DWC3's UDC
- Switch DWC3 to host mode
=> dwc3_gadget_exit() is called. usb_del_gadget() will put the
ConfigFS driver instance on the gadget_driver_pending_list
- Stop FunctionFS application (closes the ep files)
- Switch DWC3 to peripheral mode
=> dwc3_gadget_init() fails as usb_add_gadget() calls
check_pending_gadget_drivers() and attempts to rebind the UDC
to the ConfigFS gadget but fails with -19 (-ENODEV) because the
FFS instance is not in FFS_ACTIVE state (userspace has not
re-opened and written the descriptors yet, i.e. desc_ready!=0).
- Switch DWC3 back to host mode
=> dwc3_gadget_exit() is called again, but this time dwc->gadget
is invalid.
Although it can be argued that userspace should take responsibility
for ensuring that the FunctionFS application be ready prior to
allowing the composite driver to bind to the UDC, failure to do so
should not result in a panic from the kernel driver.
Fix this by setting dwc->gadget to NULL in the failure path of
dwc3_gadget_init() and add a check to dwc3_gadget_exit() to bail out
unless the gadget pointer is valid.
Fixes: e81a7018d93a ("usb: dwc3: allocate gadget structure dynamically")
Cc: <stable(a)vger.kernel.org>
Reviewed-by: Peter Chen <peter.chen(a)kernel.org>
Signed-off-by: Jack Pham <jackp(a)codeaurora.org>
Link: https://lore.kernel.org/r/20210528160405.17550-1-jackp@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/dwc3/gadget.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 2577488456da..88270eee8a48 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -4045,6 +4045,7 @@ int dwc3_gadget_init(struct dwc3 *dwc)
dwc3_gadget_free_endpoints(dwc);
err4:
usb_put_gadget(dwc->gadget);
+ dwc->gadget = NULL;
err3:
dma_free_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, dwc->bounce,
dwc->bounce_addr);
@@ -4064,6 +4065,9 @@ int dwc3_gadget_init(struct dwc3 *dwc)
void dwc3_gadget_exit(struct dwc3 *dwc)
{
+ if (!dwc->gadget)
+ return;
+
usb_del_gadget(dwc->gadget);
dwc3_gadget_free_endpoints(dwc);
usb_put_gadget(dwc->gadget);
--
2.31.1
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: rtl28xxu: fix zero-length control request
Author: Johan Hovold <johan(a)kernel.org>
Date: Mon May 24 13:09:20 2021 +0200
The direction of the pipe argument must match the request-type direction
bit or control requests may fail depending on the host-controller-driver
implementation.
Control transfers without a data stage are treated as OUT requests by
the USB stack and should be using usb_sndctrlpipe(). Failing to do so
will now trigger a warning.
Fix the zero-length i2c-read request used for type detection by
attempting to read a single byte instead.
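For reference, a minimal sketch (not taken from the driver) of how the rule reads in practice with usb_control_msg(); the vendor request numbers 0x01/0x02 and the helper name are made up, only the pipe/direction pairing matters:
  /* Assumes <linux/usb.h>; vendor requests 0x01/0x02 are placeholders. */
  static int example_vendor_ctrl(struct usb_device *udev, void *buf, u16 len)
  {
          int ret;

          /* IN request with a data stage: receive pipe + USB_DIR_IN */
          ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x01,
                                USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                0, 0, buf, len, 500);
          if (ret < 0)
                  return ret;

          /* No data stage: the USB core treats it as OUT, so use the send pipe */
          return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x02,
                                 USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                 0, 0, NULL, 0, 500);
  }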
Reported-by: syzbot+faf11bbadc5a372564da(a)syzkaller.appspotmail.com
Fixes: d0f232e823af ("[media] rtl28xxu: add heuristic to detect chip type")
Cc: stable(a)vger.kernel.org # 4.0
Cc: Antti Palosaari <crope(a)iki.fi>
Signed-off-by: Johan Hovold <johan(a)kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
drivers/media/usb/dvb-usb-v2/rtl28xxu.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
---
diff --git a/drivers/media/usb/dvb-usb-v2/rtl28xxu.c b/drivers/media/usb/dvb-usb-v2/rtl28xxu.c
index 97ed17a141bb..2c04ed8af0e4 100644
--- a/drivers/media/usb/dvb-usb-v2/rtl28xxu.c
+++ b/drivers/media/usb/dvb-usb-v2/rtl28xxu.c
@@ -612,8 +612,9 @@ static int rtl28xxu_read_config(struct dvb_usb_device *d)
static int rtl28xxu_identify_state(struct dvb_usb_device *d, const char **name)
{
struct rtl28xxu_dev *dev = d_to_priv(d);
+ u8 buf[1];
int ret;
- struct rtl28xxu_req req_demod_i2c = {0x0020, CMD_I2C_DA_RD, 0, NULL};
+ struct rtl28xxu_req req_demod_i2c = {0x0020, CMD_I2C_DA_RD, 1, buf};
dev_dbg(&d->intf->dev, "\n");
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: gspca/sunplus: fix zero-length control requests
Author: Johan Hovold <johan(a)kernel.org>
Date: Mon May 24 13:09:19 2021 +0200
The direction of the pipe argument must match the request-type direction
bit or control requests may fail depending on the host-controller-driver
implementation.
Control transfers without a data stage are treated as OUT requests by
the USB stack and should be using usb_sndctrlpipe(). Failing to do so
will now trigger a warning.
Fix the single zero-length control request which was using the
read-register helper, and update the helper so that zero-length reads
fail with an error message instead.
Fixes: 6a7eba24e4f0 ("V4L/DVB (8157): gspca: all subdrivers")
Cc: stable(a)vger.kernel.org # 2.6.27
Signed-off-by: Johan Hovold <johan(a)kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
drivers/media/usb/gspca/sunplus.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
---
diff --git a/drivers/media/usb/gspca/sunplus.c b/drivers/media/usb/gspca/sunplus.c
index ace3da40006e..971dee0a56da 100644
--- a/drivers/media/usb/gspca/sunplus.c
+++ b/drivers/media/usb/gspca/sunplus.c
@@ -242,6 +242,10 @@ static void reg_r(struct gspca_dev *gspca_dev,
gspca_err(gspca_dev, "reg_r: buffer overflow\n");
return;
}
+ if (len == 0) {
+ gspca_err(gspca_dev, "reg_r: zero-length read\n");
+ return;
+ }
if (gspca_dev->usb_err < 0)
return;
ret = usb_control_msg(gspca_dev->dev,
@@ -250,7 +254,7 @@ static void reg_r(struct gspca_dev *gspca_dev,
USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
0, /* value */
index,
- len ? gspca_dev->usb_buf : NULL, len,
+ gspca_dev->usb_buf, len,
500);
if (ret < 0) {
pr_err("reg_r err %d\n", ret);
@@ -727,7 +731,7 @@ static int sd_start(struct gspca_dev *gspca_dev)
case MegaImageVI:
reg_w_riv(gspca_dev, 0xf0, 0, 0);
spca504B_WaitCmdStatus(gspca_dev);
- reg_r(gspca_dev, 0xf0, 4, 0);
+ reg_w_riv(gspca_dev, 0xf0, 4, 0);
spca504B_WaitCmdStatus(gspca_dev);
break;
default:
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: gspca/sq905: fix control-request direction
Author: Johan Hovold <johan(a)kernel.org>
Date: Fri May 21 15:28:39 2021 +0200
The direction of the pipe argument must match the request-type direction
bit or control requests may fail depending on the host-controller-driver
implementation.
Fix the USB_REQ_SYNCH_FRAME request which erroneously used
usb_sndctrlpipe().
Fixes: 27d35fc3fb06 ("V4L/DVB (10639): gspca - sq905: New subdriver.")
Cc: stable(a)vger.kernel.org # 2.6.30
Signed-off-by: Johan Hovold <johan(a)kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
drivers/media/usb/gspca/sq905.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/usb/gspca/sq905.c b/drivers/media/usb/gspca/sq905.c
index 949111070971..32504ebcfd4d 100644
--- a/drivers/media/usb/gspca/sq905.c
+++ b/drivers/media/usb/gspca/sq905.c
@@ -116,7 +116,7 @@ static int sq905_command(struct gspca_dev *gspca_dev, u16 index)
}
ret = usb_control_msg(gspca_dev->dev,
- usb_sndctrlpipe(gspca_dev->dev, 0),
+ usb_rcvctrlpipe(gspca_dev->dev, 0),
USB_REQ_SYNCH_FRAME, /* request */
USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
SQ905_PING, 0, gspca_dev->usb_buf, 1,
From: Michael Weiser <michael.weiser(a)gmx.de>
commit 1962682d2b2fbe6cfa995a85c53c069fadda473e upstream.
Stop printing a (ratelimited) kernel message for each instance of an
unimplemented syscall being called. Userland making an unimplemented
syscall is not necessarily misbehaviour and is to be expected with a
current userland running on an older kernel. Also, the current message
looks scary to users but does not actually indicate a real problem nor
help them narrow down the cause. Just rely on sys_ni_syscall() to return
-ENOSYS.
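From userspace the net effect is unchanged apart from the missing log line; a quick sketch of what a caller sees (the syscall number 9999 is just an arbitrary value unlikely to exist on any architecture):
  #define _GNU_SOURCE
  #include <errno.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          /* Probing an unimplemented syscall: -1 with errno == ENOSYS,
           * and with this patch no scary message lands in dmesg. */
          if (syscall(9999) == -1 && errno == ENOSYS)
                  printf("syscall 9999 is not implemented on this kernel\n");
          return 0;
  }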
Cc: <stable(a)vger.kernel.org>
Cc: Martin Vajnar <martin.vajnar(a)gmail.com>
Cc: musl(a)lists.openwall.com
Acked-by: Will Deacon <will.deacon(a)arm.com>
Signed-off-by: Michael Weiser <michael.weiser(a)gmx.de>
Signed-off-by: Will Deacon <will.deacon(a)arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
This was backported to v4.14 and later, but is missing in v4.4 and
before, apparently because of a trivial merge conflict. This is
a manual backport I did after I saw a report about the issue
by Martin Vajnar on the musl mailing list.
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/arm64/kernel/traps.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 9322be69ca09..db4163808c76 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -363,14 +363,6 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs)
}
#endif
- if (show_unhandled_signals && printk_ratelimit()) {
- pr_info("%s[%d]: syscall %d\n", current->comm,
- task_pid_nr(current), (int)regs->syscallno);
- dump_instr("", regs);
- if (user_mode(regs))
- __show_regs(regs);
- }
-
return sys_ni_syscall();
}
--
2.29.2
The first two bits of CPUID leaf 0x8000001F EAX indicate whether SME
and SEV are supported, respectively. It's better to check whether SEV
or SME is supported before reading the SEV MSR (0xC0010131) to see
whether SEV or SME is enabled.
This also avoids the MSR read failure on the first-generation Hygon
Dhyana CPU, which does not support SEV or SME.
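For illustration only, the same ordering expressed as a userspace probe (using the compiler's <cpuid.h> helper; the kernel side uses native_cpuid() and __rdmsr() as in the hunk below, and the MSR itself can only be read in kernel context):
  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
                  printf("CPUID leaf 0x8000001F not available\n");
                  return 0;
          }
          /* EAX bit 0 = SME supported, bit 1 = SEV supported */
          printf("SME supported: %s\n", (eax & (1u << 0)) ? "yes" : "no");
          printf("SEV supported: %s\n", (eax & (1u << 1)) ? "yes" : "no");
          return 0;
  }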
Fixes: eab696d8e8b9 ("x86/sev: Do not require Hypervisor CPUID bit for SEV guests")
Cc: <stable(a)vger.kernel.org> # v5.10+
Signed-off-by: Pu Wen <puwen(a)hygon.cn>
---
arch/x86/mm/mem_encrypt_identity.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index a9639f663d25..470b20208430 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -504,10 +504,6 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
-
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
@@ -519,11 +515,16 @@ void __init sme_enable(struct boot_params *bp)
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
return;
me_mask = 1UL << (ebx & 0x3f);
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
/*
--
2.23.0
The abort_cmd_ia flag in an abort wqe describes whether an ABTS basic
link service should be transmitted on the FC link or not.
Code added in lpfc_sli4_issue_abort_iotag() set the abort_cmd_ia flag
incorrectly, suppressing ABTS transmission.
A previous LPFC change to build an abort wqe inverted prior logic that
determined whether an ABTS was to be issued on the FC link.
Revert this logic to its proper state.
Fixes: db7531d2b377 ("scsi: lpfc: Convert abort handling to SLI-3 and SLI-4 handlers")
Cc: <stable(a)vger.kernel.org> # v5.11+
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
---
drivers/scsi/lpfc/lpfc_sli.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 1ad1beb2a8a8..e2cfb86f7e61 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -20615,10 +20615,8 @@ lpfc_sli4_issue_abort_iotag(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
abtswqe = &abtsiocb->wqe;
memset(abtswqe, 0, sizeof(*abtswqe));
- if (lpfc_is_link_up(phba))
+ if (!lpfc_is_link_up(phba))
bf_set(abort_cmd_ia, &abtswqe->abort_cmd, 1);
- else
- bf_set(abort_cmd_ia, &abtswqe->abort_cmd, 0);
bf_set(abort_cmd_criteria, &abtswqe->abort_cmd, T_XRI_TAG);
abtswqe->abort_cmd.rsrvd5 = 0;
abtswqe->abort_cmd.wqe_com.abort_tag = xritag;
--
2.26.2
Prior to adding XSAVES support, some of fpu__clear()'s callers
required that fpu__clear() successfully reset current's FPU state
even if current's XSAVE header was corrupt. commit b860eb8dce59
("x86/fpu/xstate: Define new functions for clearing fpregs and
xstates") split fpu__clear() into two different functions, neither
of which was capable of correctly handling a corrupt header.
As a practical matter, trying to specify what
fpu__clear_user_states() is supposed to do if current's XSAVE area
is corrupt is difficult -- it's supposed to preserve supervisor
states, but the entire XSAVES buffer may be unusable due to corrupt
XFEATURES and/or XCOMP_BV.
Improve the situation by completely separating fpu__clear_all() and
fpu__clear_user_states(). The former is, once again, a full reset.
The latter requires an intact header and resets only user states.
Update and document __fpu__restore_sig() accordingly.
Cc: stable(a)vger.kernel.org
Fixes: b860eb8dce59 ("x86/fpu/xstate: Define new functions for clearing fpregs and xstates")
Reported-by: syzbot+2067e764dbcd10721e2e(a)syzkaller.appspotmail.com
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
---
arch/x86/kernel/fpu/core.c | 62 ++++++++++++++++++++++++------------
arch/x86/kernel/fpu/signal.c | 50 +++++++++++++++++++++++++----
2 files changed, 84 insertions(+), 28 deletions(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 571220ac8bea..55792ef6ba6d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -354,45 +354,65 @@ static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
}
/*
- * Clear the FPU state back to init state.
- *
- * Called by sys_execve(), by the signal handler code and by various
- * error paths.
+ * Reset current's user FPU states to the init states. The caller promises
+ * that current's supervisor states (in memory or CPU regs as appropriate)
+ * as well as the XSAVE header in memory are intact.
*/
-static void fpu__clear(struct fpu *fpu, bool user_only)
+void fpu__clear_user_states(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpu__drop(fpu);
- fpu__initialize(fpu);
+ fpu__clear_all(fpu);
return;
}
fpregs_lock();
- if (user_only) {
- if (!fpregs_state_valid(fpu, smp_processor_id()) &&
- xfeatures_mask_supervisor())
- copy_kernel_to_xregs(&fpu->state.xsave,
- xfeatures_mask_supervisor());
- copy_init_fpstate_to_fpregs(xfeatures_mask_user());
- } else {
- copy_init_fpstate_to_fpregs(xfeatures_mask_all);
- }
+ /*
+ * Ensure that current's supervisor states are loaded into
+ * their corresponding registers.
+ */
+ if (!fpregs_state_valid(fpu, smp_processor_id()) &&
+ xfeatures_mask_supervisor())
+ copy_kernel_to_xregs(&fpu->state.xsave,
+ xfeatures_mask_supervisor());
+ /*
+ * Reset user states in registers.
+ */
+ copy_init_fpstate_to_fpregs(xfeatures_mask_user());
+
+ /*
+ * Now all FPU registers have their desired values. Inform the
+ * FPU state machine that current's FPU registers are in the
+ * hardware registers.
+ */
fpregs_mark_activate();
+
fpregs_unlock();
}
-void fpu__clear_user_states(struct fpu *fpu)
-{
- fpu__clear(fpu, true);
-}
+/*
+ * Reset current's FPU registers (user and supervisor) to their INIT values.
+ * This function does not trust the in-memory XSAVE image to be valid at all;
+ * it is safe to call even if the contents of fpu->state may be entirely
+ * malicious, including the header.
+ *
+ * This means that it must not use XSAVE, as XSAVE reads the header from
+ * memory.
+ *
+ * This does not change the actual hardware registers; when fpu__clear_all()
+ * returns, TIF_NEED_FPU_LOAD will be set, and a subsequent exit to user mode
+ * will reload the hardware registers from memory.
+ */
void fpu__clear_all(struct fpu *fpu)
{
- fpu__clear(fpu, false);
+ fpregs_lock();
+ fpu__drop(fpu);
+ fpu__initialize(fpu);
+ fpregs_unlock();
}
/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..3ff7e75c7b8f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -345,6 +345,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
*/
fpregs_lock();
pagefault_disable();
+
ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only);
pagefault_enable();
if (!ret) {
@@ -382,10 +383,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
/*
- * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
- * not modified on context switch and that the xstate is considered
- * to be loaded again on return to userland (overriding last_cpu avoids
- * the optimisation).
+ * By setting TIF_NEED_FPU_LOAD, we ensure that any context switches
+ * or kernel_fpu_begin() operations (due to scheduling, page faults,
+ * softirq, etc) do not access fpu->state.
*/
fpregs_lock();
@@ -406,10 +406,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
if (using_compacted_format()) {
+ /*
+ * copy_user_to_xstate() may write complete garbage
+ * to the user states, but it will not corrupt the
+ * XSAVE header, nor will it corrupt any supervisor
+ * states.
+ */
ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
} else {
ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
-
+ /*
+ * Careful, the XSAVE header may be corrupt now.
+ */
if (!ret && state_size > offsetof(struct xregs_state, header))
ret = validate_user_xstate_header(&fpu->state.xsave.header);
}
@@ -457,6 +465,15 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_lock();
ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
}
+
+ /*
+ * At this point, we have modified the hardware FPU regs. We must
+ * either mark them active for current or invalidate them for
+ * any other task that may have owned them.
+ *
+ * Yes, the nonsensically named fpregs_deactivate() function
+ * ignores its parameter and marks this CPU's regs invalid.
+ */
if (!ret)
fpregs_mark_activate();
else
@@ -464,8 +481,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_unlock();
err_out:
- if (ret)
- fpu__clear_user_states(fpu);
+ if (ret) {
+ if (using_compacted_format()) {
+ /*
+ * Our failed attempt to load the new state is
+ * guaranteed to have preserved the XSAVE header
+ * and all supervisor state.
+ */
+ fpu__clear_user_states(fpu);
+ } else {
+ /*
+ * If an error above caused garbage to be written
+ * to XFEATURES, then XSAVE would preserve that
+ * garbage in memory, and a subsequent XRSTOR would
+ * fail or worse. XSAVES does not have this problem.
+ *
+ * Wipe out the FPU state and start over from a clean
+ * slate.
+ */
+ fpu__clear_all(fpu);
+ }
+ }
return ret;
}
--
2.31.1
Do not tear down the system when getting an invalid status from a TPM chip.
This can happen when panic-on-warn is used.
In addition, print out the value of TPM_STS.x instead of just "invalid
status". To keep the forensic value of the earlier warning, also call
dump_stack().
Link: https://lore.kernel.org/keyrings/YKzlTR1AzUigShtZ@kroah.com/
Fixes: 55707d531af6 ("tpm_tis: Add a check for invalid status")
Cc: stable(a)vger.kernel.org
Cc: Hans de Goede <hdegoede(a)redhat.com>
Cc: Greg KH <greg(a)kroah.com>
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
v2:
Dump also stack only once.
drivers/char/tpm/tpm_tis_core.c | 28 ++++++++++++++++++++--------
1 file changed, 20 insertions(+), 8 deletions(-)
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index 55b9d3965ae1..ce410f19eff2 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -188,21 +188,33 @@ static int request_locality(struct tpm_chip *chip, int l)
static u8 tpm_tis_status(struct tpm_chip *chip)
{
struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
- int rc;
+ static unsigned long klog_once;
u8 status;
+ int rc;
rc = tpm_tis_read8(priv, TPM_STS(priv->locality), &status);
if (rc < 0)
return 0;
if (unlikely((status & TPM_STS_READ_ZERO) != 0)) {
- /*
- * If this trips, the chances are the read is
- * returning 0xff because the locality hasn't been
- * acquired. Usually because tpm_try_get_ops() hasn't
- * been called before doing a TPM operation.
- */
- WARN_ONCE(1, "TPM returned invalid status\n");
+ if (!test_and_set_bit(BIT(0), &klog_once)) {
+ /*
+ * If this trips, the chances are the read is
+ * returning 0xff because the locality hasn't been
+ * acquired. Usually because tpm_try_get_ops() hasn't
+ * been called before doing a TPM operation.
+ */
+ dev_err(&chip->dev, "invalid TPM_STS.x 0x%02x, dumping stack for forensics\n",
+ status);
+
+ /*
+ * Dump stack for forensics, as invalid TPM_STS.x could be
+ * potentially triggered by impaired tpm_try_get_ops() or
+ * tpm_find_get_ops().
+ */
+ dump_stack();
+ }
+
return 0;
}
--
2.31.1
Do not tear down the system when getting an invalid status from a TPM chip.
This can happen when panic-on-warn is used.
In addition, print out the value of TPM_STS.x instead of just "invalid
status". To keep the forensic value of the earlier warning, also call
dump_stack().
Link: https://lore.kernel.org/keyrings/YKzlTR1AzUigShtZ@kroah.com/
Fixes: 55707d531af6 ("tpm_tis: Add a check for invalid status")
Cc: stable(a)vger.kernel.org
Cc: Hans de Goede <hdegoede(a)redhat.com>
Cc: Greg KH <greg(a)kroah.com>
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
drivers/char/tpm/tpm_tis_core.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index 55b9d3965ae1..514a481829c9 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -202,7 +202,16 @@ static u8 tpm_tis_status(struct tpm_chip *chip)
* acquired. Usually because tpm_try_get_ops() hasn't
* been called before doing a TPM operation.
*/
- WARN_ONCE(1, "TPM returned invalid status\n");
+ dev_err_once(&chip->dev, "invalid TPM_STS.x 0x%02x, dumping stack for forensics\n",
+ status);
+
+ /*
+ * Dump stack for forensics, as invalid TPM_STS.x could be
+ * potentially triggered by impaired tpm_try_get_ops() or
+ * tpm_find_get_ops().
+ */
+ dump_stack();
+
return 0;
}
--
2.31.1
The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
an index for which we already have a page in the cache. When this
happens, we allocate a second page, double consuming the reservation,
and then fail to insert the page into the cache and return -EEXIST.
To fix this, we first check if there exists a page in the cache which already
consumed the reservation, and return -EEXIST immediately if so.
There is still a rare condition where we fail to copy the page contents
AND race with a call for hugetlb_no_page() for this index and again we
will underflow resv_huge_pages. That is fixed in a more complicated
patch not targeted for -stable.
Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning, then:
./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
Both tests succeed and produce no warnings. After the
test runs number of free/resv hugepages is correct.
Signed-off-by: Mina Almasry <almasrymina(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: linux-mm(a)kvack.org
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: linux-mm(a)kvack.org
Cc: linux-kernel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
---
mm/hugetlb.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ead5d12e0604..76e2a6efc165 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4925,10 +4925,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
--
2.32.0.rc0.204.g9fa02ecfa5-goog
This reverts commit 9e31c1fe45d555a948ff66f1f0e3fe1f83ca63f7. Ever
since that commit, we've been having issues where a hang in one client
can propagate to another. In particular, a hang in an app can propagate
to the X server which causes the whole desktop to lock up.
Signed-off-by: Jason Ekstrand <jason.ekstrand(a)intel.com>
Reported-by: Marcin Slusarz <marcin.slusarz(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.6+
Cc: Jason Ekstrand <jason.ekstrand(a)intel.com>
Cc: Marcin Slusarz <marcin.slusarz(a)intel.com>
Cc: Jon Bloomfield <jon.bloomfield(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3080
Fixes: 9e31c1fe45d5 ("drm/i915: Propagate errors on awaiting already signaled fences")
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
---
drivers/gpu/drm/i915/i915_request.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 970d8f4986bbe..b796197c07722 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1426,10 +1426,8 @@ i915_request_await_execution(struct i915_request *rq,
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
if (fence->context == rq->fence.context)
continue;
@@ -1527,10 +1525,8 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
do {
fence = *child++;
- if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
- i915_sw_fence_set_error_once(&rq->submit, fence->error);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
continue;
- }
/*
* Requests on the same timeline are explicitly ordered, along
--
2.31.1
From: Kan Liang <kan.liang(a)linux.intel.com>
Perf tool errors out with the latest event list for the Ice Lake server.
event syntax error: 'unc_m2m_imc_reads.to_pmm'
\___ value too big for format, maximum is 255
As on the Snow Ridge server, the M2M uncore unit in the Ice Lake
server also has a unit mask extension field.
Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support")
Reported-by: Jin Yao <yao.jin(a)linux.intel.com>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/events/intel/uncore_snbep.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index acc3c0e5..06c055d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -5106,9 +5106,10 @@ static struct intel_uncore_type icx_uncore_m2m = {
.perf_ctr = SNR_M2M_PCI_PMON_CTR0,
.event_ctl = SNR_M2M_PCI_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT,
.box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
.ops = &snr_m2m_uncore_pci_ops,
- .format_group = &skx_uncore_format_group,
+ .format_group = &snr_m2m_uncore_format_group,
};
static struct attribute *icx_upi_uncore_formats_attr[] = {
--
2.7.4
As promised on the list [0], this series aims to backport 3 upstream
commits [1,2,3] into 5.12-stable tree.
Patch #1 is already in the queue and therefore not included. Patch #2 can
be applied now by manually adding the __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc
macro (please review). Patch #3 can be applied cleanly then (after #2).
I've slightly tested it on my 920 (boot test and the whole kvm-unit-tests),
on top of the latest linux-stable-rc/linux-5.12.y. Please consider taking
them for 5.12-stable.
[0] https://lore.kernel.org/r/0d9f123c-e9f7-7481-143d-efd488873082@huawei.com
[1] https://git.kernel.org/torvalds/c/f5e30680616a
[2] https://git.kernel.org/torvalds/c/26778aaa134a
[3] https://git.kernel.org/torvalds/c/e3e880bb1518
Marc Zyngier (1):
KVM: arm64: Commit pending PC adjustemnts before returning to
userspace
Zenghui Yu (1):
KVM: arm64: Resolve all pending PC updates before immediate exit
arch/arm64/include/asm/kvm_asm.h | 1 +
arch/arm64/kvm/arm.c | 20 +++++++++++++++++---
arch/arm64/kvm/hyp/exception.c | 4 ++--
arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++++++++
4 files changed, 28 insertions(+), 5 deletions(-)
--
2.19.1
From: Chris Wilson <chris(a)chris-wilson.co.uk>
The first tracepoint for a request is trace_dma_fence_init called before
we have associated the request with a device. The tracepoint uses
fence->ops->get_driver_name() as a pretty name, and as we try to report
the device name this oopses because rq->engine is still NULL. Support the early
tracepoint by reporting the DRIVER_NAME instead of the actual device
name.
Note that rq->engine remains during the course of request recycling
(SLAB_TYPESAFE_BY_RCU). For the physical engines, the pointer remains
valid, however a virtual engine may be destroyed after the request is
retired. If we process a preempt-to-busy completed request along the
virtual engine, we should make sure we mark the request as no longer
belonging to the virtual engine to remove the dangling pointers from the
tracepoint.
Fixes: 855e39e65cfc ("drm/i915: Initialise basic fence before acquiring seqno")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)linux.intel.com>
Cc: Chintan M Patel <chintan.m.patel(a)intel.com>
Cc: Andi Shyti <andi.shyti(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.7+
Signed-off-by: Matthew Auld <matthew.auld(a)intel.com>
---
.../drm/i915/gt/intel_execlists_submission.c | 20 ++++++++++++++-----
drivers/gpu/drm/i915/i915_request.c | 7 ++++++-
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index de124870af44..75604e927d34 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3249,6 +3249,18 @@ static struct list_head *virtual_queue(struct virtual_engine *ve)
return &ve->base.execlists.default_priolist.requests;
}
+static void
+virtual_submit_completed(struct virtual_engine *ve, struct i915_request *rq)
+{
+ GEM_BUG_ON(!__i915_request_is_complete(rq));
+ GEM_BUG_ON(rq->engine != &ve->base);
+
+ __i915_request_submit(rq);
+
+ /* Remove the dangling pointer to the stale virtual engine */
+ WRITE_ONCE(rq->engine, ve->siblings[0]);
+}
+
static void rcu_virtual_context_destroy(struct work_struct *wrk)
{
struct virtual_engine *ve =
@@ -3265,8 +3277,7 @@ static void rcu_virtual_context_destroy(struct work_struct *wrk)
old = fetch_and_zero(&ve->request);
if (old) {
- GEM_BUG_ON(!__i915_request_is_complete(old));
- __i915_request_submit(old);
+ virtual_submit_completed(ve, old);
i915_request_put(old);
}
@@ -3538,13 +3549,12 @@ static void virtual_submit_request(struct i915_request *rq)
/* By the time we resubmit a request, it may be completed */
if (__i915_request_is_complete(rq)) {
- __i915_request_submit(rq);
+ virtual_submit_completed(ve, rq);
goto unlock;
}
if (ve->request) { /* background completion from preempt-to-busy */
- GEM_BUG_ON(!__i915_request_is_complete(ve->request));
- __i915_request_submit(ve->request);
+ virtual_submit_completed(ve, ve->request);
i915_request_put(ve->request);
}
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 970d8f4986bb..aa124adb1051 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -61,7 +61,12 @@ static struct i915_global_request {
static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
- return dev_name(to_request(fence)->engine->i915->drm.dev);
+ struct i915_request *rq = to_request(fence);
+
+ if (unlikely(!rq->engine)) /* not yet attached to any device */
+ return DRIVER_NAME;
+
+ return dev_name(rq->engine->i915->drm.dev);
}
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
--
2.26.3
From: Niklas Cassel <niklas.cassel(a)wdc.com>
Performing a BLKREPORTZONE operation should be allowed under the same
permissions as read(). (read() does not require CAP_SYS_ADMIN).
Remove the CAP_SYS_ADMIN requirement, and instead check that the fd was
successfully opened with FMODE_READ. This way BLKREPORTZONE will match
the access control requirement of read().
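For illustration, a minimal userspace sketch of what this permits (device path and zone count are examples, not part of the patch):
  #include <fcntl.h>
  #include <linux/blkzoned.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  int main(void)
  {
          unsigned int nr = 16, i;
          struct blk_zone_report *rep;
          int fd;

          rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
          fd = open("/dev/nullb0", O_RDONLY);  /* read-only open, no CAP_SYS_ADMIN */
          if (!rep || fd < 0)
                  return 1;

          rep->sector = 0;
          rep->nr_zones = nr;
          if (ioctl(fd, BLKREPORTZONE, rep) == 0)
                  for (i = 0; i < rep->nr_zones; i++)
                          printf("zone %u: start %llu len %llu\n", i,
                                 (unsigned long long)rep->zones[i].start,
                                 (unsigned long long)rep->zones[i].len);
          close(fd);
          free(rep);
          return 0;
  }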
Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls")
Signed-off-by: Niklas Cassel <niklas.cassel(a)wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Cc: stable(a)vger.kernel.org # v4.10+
---
Changes since v1:
- Pick up tag from Damien.
- Add fixes tag and CC stable.
block/blk-zoned.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 0789e6e9f7db..e05fe8dbb06d 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -288,8 +288,8 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!blk_queue_is_zoned(q))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (!(mode & FMODE_READ))
+ return -EBADF;
if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
return -EFAULT;
--
2.31.1
From: Niklas Cassel <niklas.cassel(a)wdc.com>
Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE
and BLKFINISHZONE) should be allowed under the same permissions as write().
(write() does not require CAP_SYS_ADMIN).
Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if
the fd was successfully opened with FMODE_WRITE.
(They do not require CAP_SYS_ADMIN).
Currently, zone management send operations require both CAP_SYS_ADMIN
and that the fd was successfully opened with FMODE_WRITE.
Remove the CAP_SYS_ADMIN requirement, so that zone management send
operations match the access control requirement of write(), BLKSECDISCARD
and BLKZEROOUT.
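A similarly hedged sketch of the write-side counterpart (device path and zone size are placeholders; a real tool would query the zone size with BLKGETZONESZ first):
  #include <fcntl.h>
  #include <linux/blkzoned.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  int main(void)
  {
          /* Reset the first zone; 524288 sectors is only an example size. */
          struct blk_zone_range range = { .sector = 0, .nr_sectors = 524288 };
          int fd = open("/dev/nullb0", O_WRONLY);  /* write access is enough now */

          if (fd < 0)
                  return 1;
          ioctl(fd, BLKRESETZONE, &range);
          close(fd);
          return 0;
  }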
Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls")
Signed-off-by: Niklas Cassel <niklas.cassel(a)wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Cc: stable(a)vger.kernel.org # v4.10+
---
Changes since v1:
- Pick up tag from Damien.
- Add fixes tag and CC stable.
Note to backporter:
Function was added as blkdev_reset_zones_ioctl() in v4.10.
Function was renamed to blkdev_zone_mgmt_ioctl() in v5.5.
The patch is valid both before and after the function rename.
block/blk-zoned.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 250cb76ee615..0789e6e9f7db 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -349,9 +349,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
if (!blk_queue_is_zoned(q))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
if (!(mode & FMODE_WRITE))
return -EBADF;
--
2.31.1
Hi,
This is version 2 of a backport for v5.10.
Changes since v2:
1. Rebased v3 of v5.4 version, I kept numbering to be consistent.
2. The context in patch 1/3 had to be adjusted.
The first patch specifically fixes bug aftert 2nd resume:
BUG: Bad page state in process dbus-daemon pfn:18b01
page:ffffea000062c040 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 compound_mapcount: -30591
flags: 0xfffffc0078141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
raw: 000fffffc0078141 dead0000000002d0 dead000000000100 0000000000000000
raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag set
bad because of flags: 0x78141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
Best regards,
Krzysztof
Vitaly Kuznetsov (3):
x86/kvm: Teardown PV features on boot CPU as well
x86/kvm: Disable kvmclock on all CPUs on shutdown
x86/kvm: Disable all PV features on crash
arch/x86/include/asm/kvm_para.h | 10 +---
arch/x86/kernel/kvm.c | 92 ++++++++++++++++++++++++---------
arch/x86/kernel/kvmclock.c | 26 +---------
3 files changed, 72 insertions(+), 56 deletions(-)
--
2.27.0
The patch titled
Subject: mm, hugetlb: fix simple resv_huge_pages underflow on UFFDIO_COPY
has been added to the -mm tree. Its filename is
mm-hugetlb-fix-simple-resv_huge_pages-underflow-on-uffdio_copy.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlb-fix-simple-resv_huge_p…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-hugetlb-fix-simple-resv_huge_p…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mina Almasry <almasrymina(a)google.com>
Subject: mm, hugetlb: fix simple resv_huge_pages underflow on UFFDIO_COPY
The userfaultfd hugetlb tests cause a resv_huge_pages underflow. This
happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on an
index for which we already have a page in the cache. When this happens,
we allocate a second page, double consuming the reservation, and then fail
to insert the page into the cache and return -EEXIST.
To fix this, we first check if there is a page in the cache which already
consumed the reservation, and return -EEXIST immediately if so.
There is still a rare condition where we fail to copy the page contents
AND race with a call to hugetlb_no_page() for this index, and again we
will underflow resv_huge_pages. That is fixed in a more complicated patch
not targeted for -stable.
Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning, then:
./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
Both tests succeed and produce no warnings. After the
test runs number of free/resv hugepages is correct.
[mike.kravetz(a)oracle.com: changelog fixes]
Link: https://lkml.kernel.org/r/20210528004649.85298-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-simple-resv_huge_pages-underflow-on-uffdio_copy
+++ a/mm/hugetlb.c
@@ -4889,10 +4889,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
if (!page)
goto out;
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
_
Patches currently in -mm which might be from almasrymina(a)google.com are
mm-hugetlb-fix-simple-resv_huge_pages-underflow-on-uffdio_copy.patch
mm-hugetlb-fix-racy-resv_huge_pages-underflow-on-uffdio_copy.patch
The patch titled
Subject: ocfs2: fix data corruption by fallocate
has been added to the -mm tree. Its filename is
ocfs2-fix-data-corruption-by-fallocate.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/ocfs2-fix-data-corruption-by-fall…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/ocfs2-fix-data-corruption-by-fall…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Junxiao Bi <junxiao.bi(a)oracle.com>
Subject: ocfs2: fix data corruption by fallocate
When fallocate punches holes out of inode size, if the original isize is
in the middle of the last cluster, then the part from isize to the end of
the cluster will be zeroed with a buffer write. At that time isize is not
yet updated to match the new size, so if writeback is kicked in, it will
invoke ocfs2_writepage()->block_write_full_page(), where the pages out of
inode size are dropped. That causes file corruption. Fix this by zeroing
out the eof blocks when extending the inode size.
Running the following command with qemu-img 4.2.1 easily produces a
corrupted converted image file.
qemu-img convert -p -t none -T none -f qcow2 $qcow_image \
-O qcow2 -o compat=1.1 $qcow_image.conv
The usage of fallocate in qemu is like this: it first punches holes out of
inode size, then extends the inode size.
fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0
fallocate(11, 0, 2276196352, 65536) = 0
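The same call pattern boiled down to a small C sketch (the path and offsets are illustrative and mirror the trace above; on its own this is the trigger pattern, not a guaranteed reproducer, and it assumes a 64-bit off_t):
  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/mnt/ocfs2/image.conv", O_RDWR | O_CREAT, 0644);
          off_t off = 2276196352LL;

          if (fd < 0)
                  return 1;
          /* punch a hole past i_size, keeping the current size ... */
          fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, off, 65536);
          /* ... then extend i_size over the same range */
          fallocate(fd, 0, off, 65536);
          close(fd);
          return 0;
  }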
v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html
v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/…
Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi(a)oracle.com>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/file.c | 55 +++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 50 insertions(+), 5 deletions(-)
--- a/fs/ocfs2/file.c~ocfs2-fix-data-corruption-by-fallocate
+++ a/fs/ocfs2/file.c
@@ -1856,6 +1856,45 @@ out:
}
/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
+/*
* Parts of this function taken from xfs_change_file_space()
*/
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
@@ -1865,7 +1904,7 @@ static int __ocfs2_change_file_space(str
{
int ret;
s64 llen;
- loff_t size;
+ loff_t size, orig_isize;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *di_bh = NULL;
handle_t *handle;
@@ -1896,6 +1935,7 @@ static int __ocfs2_change_file_space(str
goto out_inode_unlock;
}
+ orig_isize = i_size_read(inode);
switch (sr->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1903,7 +1943,7 @@ static int __ocfs2_change_file_space(str
sr->l_start += f_pos;
break;
case 2: /*SEEK_END*/
- sr->l_start += i_size_read(inode);
+ sr->l_start += orig_isize;
break;
default:
ret = -EINVAL;
@@ -1957,6 +1997,14 @@ static int __ocfs2_change_file_space(str
default:
ret = -EINVAL;
}
+
+ /* zeroout eof blocks in the cluster. */
+ if (!ret && change_size && orig_isize < size) {
+ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
+ size - orig_isize);
+ if (!ret)
+ i_size_write(inode, size);
+ }
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
@@ -1973,9 +2021,6 @@ static int __ocfs2_change_file_space(str
goto out_inode_unlock;
}
- if (change_size && i_size_read(inode) < size)
- i_size_write(inode, size);
-
inode->i_ctime = inode->i_mtime = current_time(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
_
Patches currently in -mm which might be from junxiao.bi(a)oracle.com are
ocfs2-fix-data-corruption-by-fallocate.patch
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 9a90ed065a155d13db0d0ffeaad5cc54e51c90c6
Gitweb: https://git.kernel.org/tip/9a90ed065a155d13db0d0ffeaad5cc54e51c90c6
Author: Borislav Petkov <bp(a)suse.de>
AuthorDate: Thu, 27 May 2021 11:02:26 +02:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Mon, 31 May 2021 22:32:26 +02:00
x86/thermal: Fix LVT thermal setup for SMI delivery mode
There are machines out there with added value crap^WBIOS which provide an
SMI handler for the local APIC thermal sensor interrupt. Out of reset,
the BSP on those machines has something like 0x200 in that APIC register
(timestamps left in because this whole issue is timing sensitive):
[ 0.033858] read lvtthmr: 0x330, val: 0x200
which means:
- bit 16 - the interrupt mask bit is clear and thus that interrupt is enabled
- bits [10:8] have 010b which means SMI delivery mode.
Now, later during boot, when the kernel programs the local APIC, it
soft-disables it temporarily through the spurious vector register:
setup_local_APIC:
...
/*
* If this comes from kexec/kcrash the APIC might be enabled in
* SPIV. Soft disable it before doing further initialization.
*/
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
which means (from the SDM):
"10.4.7.2 Local APIC State After It Has Been Software Disabled
...
* The mask bits for all the LVT entries are set. Attempts to reset these
bits will be ignored."
And this happens too:
[ 0.124111] APIC: Switch to symmetric I/O mode setup
[ 0.124117] lvtthmr 0x200 before write 0xf to APIC 0xf0
[ 0.124118] lvtthmr 0x10200 after write 0xf to APIC 0xf0
This results in CPU 0 soft lockups depending on the placement in time
when the APIC soft-disable happens. Those soft lockups are not 100%
reproducible and the reason for that can only be speculated as no one
tells you what SMM does. Likely, it confuses the SMM code that the APIC
is disabled and the thermal interrupt doesn't fire at all,
leading to CPU 0 stuck in SMM forever...
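For convenience, a tiny userspace decoder for the two LVTTHMR values quoted above (bit 16 is the mask bit, bits [10:8] the delivery mode, per the SDM); illustration only, not kernel code:
  #include <stdio.h>

  static const char *lvt_delivery(unsigned int lvt)
  {
          switch ((lvt >> 8) & 0x7) {     /* bits [10:8] */
          case 0: return "Fixed";
          case 2: return "SMI";
          case 4: return "NMI";
          case 5: return "INIT";
          case 7: return "ExtINT";
          default: return "Reserved";
          }
  }

  int main(void)
  {
          unsigned int vals[] = { 0x200, 0x10200 }, i;

          for (i = 0; i < 2; i++)
                  printf("LVTTHMR 0x%05x: mask=%u delivery=%s\n", vals[i],
                         (vals[i] >> 16) & 1, lvt_delivery(vals[i]));
          return 0;
  }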
Now, before
4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()")
due to how the APIC_LVTTHMR was read before APIC initialization in
mcheck_intel_therm_init(), it would read the value with the mask bit 16
clear and then intel_init_thermal() would replicate it onto the APs and
all would be peachy - the thermal interrupt would remain enabled.
But that commit moved that reading to a later moment in
intel_init_thermal(), resulting in reading APIC_LVTTHMR on the BSP too
late and with its interrupt mask bit set.
Thus, revert back to the old behavior of reading the thermal LVT
register before the APIC gets initialized.
Fixes: 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()")
Reported-by: James Feeney <james(a)nurealm.net>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Cc: Zhang Rui <rui.zhang(a)intel.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Link: https://lkml.kernel.org/r/YKIqDdFNaXYd39wz@zn.tnic
---
arch/x86/include/asm/thermal.h | 4 +++-
arch/x86/kernel/setup.c | 9 +++++++++
drivers/thermal/intel/therm_throt.c | 15 +++++++++++----
3 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
index ddbdefd..91a7b66 100644
--- a/arch/x86/include/asm/thermal.h
+++ b/arch/x86/include/asm/thermal.h
@@ -3,11 +3,13 @@
#define _ASM_X86_THERMAL_H
#ifdef CONFIG_X86_THERMAL_VECTOR
+void therm_lvt_init(void);
void intel_init_thermal(struct cpuinfo_x86 *c);
bool x86_thermal_enabled(void);
void intel_thermal_interrupt(void);
#else
-static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+static inline void therm_lvt_init(void) { }
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
#endif
#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 72920af..ff653d6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -44,6 +44,7 @@
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
+#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
@@ -1226,6 +1227,14 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index f8e8825..99abdc0 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -621,6 +621,17 @@ bool x86_thermal_enabled(void)
return atomic_read(&therm_throt_en);
}
+void __init therm_lvt_init(void)
+{
+ /*
+ * This function is only called on boot CPU. Save the init thermal
+ * LVT value on BSP and use that value to restore APs' thermal LVT
+ * entry BIOS programmed later
+ */
+ if (intel_thermal_supported(&boot_cpu_data))
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
@@ -630,10 +641,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if (!intel_thermal_supported(c))
return;
- /* On the BSP? */
- if (c == &boot_cpu_data)
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
In commit 92af4fc6ec33 ("usb: musb: Fix suspend with devices
connected for a64"), the logic to support the
MUSB_QUIRK_B_DISCONNECT_99 quirk was modified to only conditionally
schedule the musb->irq_work delayed work.
This commit badly breaks ECM Gadget on AM335X. Indeed, with this
commit, one can observe massive packet loss:
$ ping 192.168.0.100
...
15 packets transmitted, 3 received, 80% packet loss, time 14316ms
Reverting this commit brings back a properly functioning ECM
Gadget. An analysis of the commit indicates that a mistake was made:
the previous code never fell through into the
MUSB_QUIRK_B_INVALID_VBUS_91 case, but now it does, unless the
condition is taken.
Changing the logic to be as it was before the problematic commit *and*
only conditionally scheduling musb->irq_work resolves the regression:
$ ping 192.168.0.100
...
64 packets transmitted, 64 received, 0% packet loss, time 64475ms
Fixes: 92af4fc6ec33 ("usb: musb: Fix suspend with devices connected for a64")
Signed-off-by: Thomas Petazzoni <thomas.petazzoni(a)bootlin.com>
Cc: stable(a)vger.kernel.org
---
drivers/usb/musb/musb_core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c
index 8f09a387b773..4c8f0112481f 100644
--- a/drivers/usb/musb/musb_core.c
+++ b/drivers/usb/musb/musb_core.c
@@ -2009,9 +2009,8 @@ static void musb_pm_runtime_check_session(struct musb *musb)
schedule_delayed_work(&musb->irq_work,
msecs_to_jiffies(1000));
musb->quirk_retries--;
- break;
}
- fallthrough;
+ break;
case MUSB_QUIRK_B_INVALID_VBUS_91:
if (musb->quirk_retries && !musb->flush_irq_work) {
musb_dbg(musb,
--
2.31.1
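The restored control flow can be modelled with a toy switch statement.
This is a generic sketch of the fixed shape (conditional scheduling, no
fall-through), not the musb driver; the enum and function names are
invented for illustration.
/* Toy model: with the fix, the DISCONNECT_99 case never falls through
 * into the INVALID_VBUS_91 case; only the "schedule and retry" action
 * is conditional. */
#include <stdbool.h>
#include <stdio.h>

enum quirk { QUIRK_B_DISCONNECT_99, QUIRK_B_INVALID_VBUS_91 };

static void handle(enum quirk q, bool retries_left)
{
	switch (q) {
	case QUIRK_B_DISCONNECT_99:
		if (retries_left)
			printf("  schedule irq_work, decrement retries\n");
		break;	/* restored behaviour: never fall through */
	case QUIRK_B_INVALID_VBUS_91:
		printf("  handle the invalid-VBUS quirk\n");
		break;
	}
}

int main(void)
{
	printf("DISCONNECT_99, retries left:\n");
	handle(QUIRK_B_DISCONNECT_99, true);
	printf("DISCONNECT_99, no retries:\n");
	handle(QUIRK_B_DISCONNECT_99, false);
	return 0;
}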
Hi,
This is version 3 of a backport for v5.4.
Changes since v2:
1. Send proper backport without context changes,
2. This series is only for v5.4.
Changes since v1:
1. Clean backport, without context changes.
The first patch specifically fixes a bug after the 2nd resume:
BUG: Bad page state in process dbus-daemon pfn:18b01
page:ffffea000062c040 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 compound_mapcount: -30591
flags: 0xfffffc0078141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
raw: 000fffffc0078141 dead0000000002d0 dead000000000100 0000000000000000
raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag set
bad because of flags: 0x78141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
Best regards,
Krzysztof
Vitaly Kuznetsov (3):
x86/kvm: Teardown PV features on boot CPU as well
x86/kvm: Disable kvmclock on all CPUs on shutdown
x86/kvm: Disable all PV features on crash
arch/x86/include/asm/kvm_para.h | 10 +---
arch/x86/kernel/kvm.c | 92 ++++++++++++++++++++++++---------
arch/x86/kernel/kvmclock.c | 26 +---------
3 files changed, 72 insertions(+), 56 deletions(-)
--
2.27.0
When NET_NS is not enabled, the socket ioctl cmd SIOCGSKNS should do
nothing but tell userspace that it is not supported. Otherwise, the
kernel would panic whenever nsfs tries to access ns->ops, since
proc_ns_operations is not implemented in this case.
[7.670023] Unable to handle kernel NULL pointer dereference at virtual address 00000010
[7.670268] pgd = 32b54000
[7.670544] [00000010] *pgd=00000000
[7.671861] Internal error: Oops: 5 [#1] SMP ARM
[7.672315] Modules linked in:
[7.672918] CPU: 0 PID: 1 Comm: systemd Not tainted 5.13.0-rc3-00375-g6799d4f2da49 #16
[7.673309] Hardware name: Generic DT based system
[7.673642] PC is at nsfs_evict+0x24/0x30
[7.674486] LR is at clear_inode+0x20/0x9c
Signed-off-by: Changbin Du <changbin.du(a)gmail.com>
Cc: <stable(a)vger.kernel.org> # v4.9
---
net/socket.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/socket.c b/net/socket.c
index 27e3e7d53f8e..644b46112d35 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1149,11 +1149,15 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
mutex_unlock(&vlan_ioctl_mutex);
break;
case SIOCGSKNS:
+#ifdef CONFIG_NET_NS
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = open_related_ns(&net->ns, get_net_ns);
+#else
+ err = -ENOTSUPP;
+#endif
break;
case SIOCGSTAMP_OLD:
case SIOCGSTAMPNS_OLD:
--
2.27.0
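For reference, the affected path is reachable from plain userspace; a
minimal, illustrative caller of SIOCGSKNS (roughly what systemd does
before it ends up in nsfs) is sketched below. It needs CAP_NET_ADMIN,
and on a kernel built without CONFIG_NET_NS that carries this patch the
ioctl now fails with an error instead of oopsing.
/* Illustrative only: ask for a handle on the socket's network
 * namespace via SIOCGSKNS and report the outcome. */
#include <errno.h>
#include <linux/sockios.h>	/* SIOCGSKNS */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror("socket");
		return 1;
	}

	int nsfd = ioctl(s, SIOCGSKNS);
	if (nsfd < 0) {
		printf("SIOCGSKNS failed: %s\n", strerror(errno));
	} else {
		printf("got netns fd %d\n", nsfd);
		close(nsfd);
	}

	close(s);
	return 0;
}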
"Hello,
We are interested in advertising. We want to advertise guest article
on your site planet.kernel.org. Our article will be written according
to your site theme and will include a link to a betting site.
What is the rate for this?
IMPORTANT: If you are not accepting links to gambling site we might
be still interested, Please mention the rate for regular links as well
Skype: shanker2020
Telegram 9966160005
WhatsApp number: 9966160005
Thank you"
Hi,
This is version 2 of a backport for v5.4+.
Changes since v1:
1. Clean backport, without context changes.
The first patch specifically fixes a bug after the 2nd resume:
BUG: Bad page state in process dbus-daemon pfn:18b01
page:ffffea000062c040 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 compound_mapcount: -30591
flags: 0xfffffc0078141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
raw: 000fffffc0078141 dead0000000002d0 dead000000000100 0000000000000000
raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag set
bad because of flags: 0x78141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
Best regards,
Krzysztof
Vitaly Kuznetsov (3):
x86/kvm: Teardown PV features on boot CPU as well
x86/kvm: Disable kvmclock on all CPUs on shutdown
x86/kvm: Disable all PV features on crash
arch/x86/include/asm/kvm_para.h | 9 +---
arch/x86/kernel/kvm.c | 75 ++++++++++++++++++++++++++-------
arch/x86/kernel/kvmclock.c | 26 +-----------
3 files changed, 63 insertions(+), 47 deletions(-)
--
2.27.0
From: Lin Ma <linma(a)zju.edu.cn>
commit e2cb6b891ad2b8caa9131e3be70f45243df82a80 upstream.
There is a possible race condition vulnerability between issuing an HCI
command and removing the controller. Specifically, functions
hci_req_sync() and hci_dev_do_close() can race each other like below:
thread-A in hci_req_sync()       | thread-B in hci_dev_do_close()
                                 | hci_req_sync_lock(hdev);
test_bit(HCI_UP, &hdev->flags);  |
...                              | test_and_clear_bit(HCI_UP, &hdev->flags)
hci_req_sync_lock(hdev);         |
                                 |
In this commit we alter the sequence in function hci_req_sync(). Hence,
thread-A cannot issue the HCI command once thread-B has cleared HCI_UP.
Signed-off-by: Lin Ma <linma(a)zju.edu.cn>
Cc: Marcel Holtmann <marcel(a)holtmann.org>
Fixes: 7c6a329e4447 ("[Bluetooth] Fix regression from using default link policy")
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[iwamatsu: adjust filename, arguments of __hci_req_sync(). CVE-2021-32399]
Signed-off-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu(a)toshiba.co.jp>
---
net/bluetooth/hci_core.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index cc905a4e573253..81a81b9a3c7d00 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -371,12 +371,17 @@ static int hci_req_sync(struct hci_dev *hdev,
{
int ret;
- if (!test_bit(HCI_UP, &hdev->flags))
- return -ENETDOWN;
-
/* Serialize all requests */
hci_req_lock(hdev);
- ret = __hci_req_sync(hdev, req, opt, timeout);
+ /* check the state after obtaining the lock to protect the HCI_UP
+ * against any races from hci_dev_do_close when the controller
+ * gets removed.
+ */
+ if (test_bit(HCI_UP, &hdev->flags))
+ ret = __hci_req_sync(hdev, req, opt, timeout);
+ else
+ ret = -ENETDOWN;
+
hci_req_unlock(hdev);
return ret;
--
2.31.1
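The shape of the fix is the generic "take the lock first, then test the
flag" pattern; below is a stand-alone pthreads sketch of the before and
after shapes. It is illustrative only - none of it is Bluetooth code -
and builds with -pthread.
/* Testing a flag before taking the lock leaves a window in which
 * another thread can clear it under the lock; testing it after the
 * lock is held closes that window. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool device_up = true;

/* Old shape: check, then lock - device_up may change in between. */
static int request_racy(void)
{
	if (!device_up)
		return -1;
	pthread_mutex_lock(&lock);
	puts("issuing request (possibly against a closed device)");
	pthread_mutex_unlock(&lock);
	return 0;
}

/* Fixed shape: lock, then check - close() cannot sneak in between. */
static int request_safe(void)
{
	int ret = -1;

	pthread_mutex_lock(&lock);
	if (device_up) {
		puts("issuing request");
		ret = 0;
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

static void device_close(void)
{
	pthread_mutex_lock(&lock);
	device_up = false;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	request_safe();
	device_close();
	printf("after close: racy=%d safe=%d\n", request_racy(), request_safe());
	return 0;
}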
The patch below does not apply to the 5.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e3e880bb1518eb10a4b4bb4344ed614d6856f190 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui(a)huawei.com>
Date: Wed, 26 May 2021 22:18:31 +0800
Subject: [PATCH] KVM: arm64: Resolve all pending PC updates before immediate
exit
Commit 26778aaa134a ("KVM: arm64: Commit pending PC adjustemnts before
returning to userspace") fixed the PC updating issue by forcing an explicit
synchronisation of the exception state on vcpu exit to userspace.
However, we forgot to take into account the case where immediate_exit is
set by userspace and KVM_RUN will exit immediately. Fix it by resolving all
pending PC updates before returning to userspace.
Since __kvm_adjust_pc() relies on a loaded vcpu context, I moved the
immediate_exit checking right after vcpu_load(). We will get some overhead
if immediate_exit is true (which should hopefully be rare).
Fixes: 26778aaa134a ("KVM: arm64: Commit pending PC adjustemnts before returning to userspace")
Signed-off-by: Zenghui Yu <yuzenghui(a)huawei.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Link: https://lore.kernel.org/r/20210526141831.1662-1-yuzenghui@huawei.com
Cc: stable(a)vger.kernel.org # 5.11
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1126eae27400..e720148232a0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -720,11 +720,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
return ret;
}
- if (run->immediate_exit)
- return -EINTR;
-
vcpu_load(vcpu);
+ if (run->immediate_exit) {
+ ret = -EINTR;
+ goto out;
+ }
+
kvm_sigset_activate(vcpu);
ret = 1;
@@ -897,6 +899,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_sigset_deactivate(vcpu);
+out:
/*
* In the unlikely event that we are returning to userspace
* with pending exceptions or PC adjustment, commit these
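The immediate_exit path touched by this fix is driven entirely from
userspace by setting run->immediate_exit before KVM_RUN; a minimal
sketch of such a caller follows. It is illustrative only: error handling
is trimmed, the -1/EINTR outcome is what the fixed code returns after
committing any pending PC adjustment, and on arm64 the vcpu would
additionally need KVM_ARM_VCPU_INIT before KVM_RUN gets that far.
/* Illustrative only: create a VM and a vcpu, request an immediate
 * exit, and call KVM_RUN once. */
#include <errno.h>
#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu, 0);

	if (kvm < 0 || vm < 0 || vcpu < 0 || sz < 0 || run == MAP_FAILED) {
		fprintf(stderr, "KVM setup failed: %s\n", strerror(errno));
		return 1;
	}

	run->immediate_exit = 1;	/* ask KVM_RUN to bail out at once */
	int ret = ioctl(vcpu, KVM_RUN, 0);
	printf("KVM_RUN returned %d, errno=%s\n", ret, strerror(errno));
	return 0;
}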
The patch below does not apply to the 5.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26778aaa134a9aefdf5dbaad904054d7be9d656d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 6 May 2021 15:20:12 +0100
Subject: [PATCH] KVM: arm64: Commit pending PC adjustemnts before returning to
userspace
KVM currently updates PC (and the corresponding exception state)
using a two phase approach: first by setting a set of flags,
then by converting these flags into a state update when the vcpu
is about to enter the guest.
However, this creates a disconnect with userspace if the vcpu thread
returns there with any exception/PC flag set. In this case, the exposed
context is wrong, as userspace doesn't have access to these flags
(they aren't architectural). It also means that these flags are
preserved across a reset, which isn't expected.
To solve this problem, force an explicit synchronisation of the
exception state on vcpu exit to userspace. As an optimisation
for nVHE systems, only perform this when there is something pending.
Reported-by: Zenghui Yu <yuzenghui(a)huawei.com>
Reviewed-by: Alexandru Elisei <alexandru.elisei(a)arm.com>
Reviewed-by: Zenghui Yu <yuzenghui(a)huawei.com>
Tested-by: Zenghui Yu <yuzenghui(a)huawei.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org # 5.11
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index d5b11037401d..5e9b33cbac51 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -63,6 +63,7 @@
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
+#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1cb39c0803a4..1126eae27400 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -897,6 +897,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_sigset_deactivate(vcpu);
+ /*
+ * In the unlikely event that we are returning to userspace
+ * with pending exceptions or PC adjustment, commit these
+ * adjustments in order to give userspace a consistent view of
+ * the vcpu state. Note that this relies on __kvm_adjust_pc()
+ * being preempt-safe on VHE.
+ */
+ if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION |
+ KVM_ARM64_INCREMENT_PC)))
+ kvm_call_hyp(__kvm_adjust_pc, vcpu);
+
vcpu_put(vcpu);
return ret;
}
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 0812a496725f..11541b94b328 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -331,8 +331,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
}
/*
- * Adjust the guest PC on entry, depending on flags provided by EL1
- * for the purpose of emulation (MMIO, sysreg) or exception injection.
+ * Adjust the guest PC (and potentially exception state) depending on
+ * flags provided by the emulation code.
*/
void __kvm_adjust_pc(struct kvm_vcpu *vcpu)
{
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index f36420a80474..1632f001f4ed 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -28,6 +28,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu));
}
+static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+
+ __kvm_adjust_pc(kern_hyp_va(vcpu));
+}
+
static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt)
{
__kvm_flush_vm_context();
@@ -170,6 +177,7 @@ typedef void (*hcall_t)(struct kvm_cpu_context *);
static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
+ HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
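For readers unfamiliar with the flag-based scheme described above, the
two-phase idea reduces to a small self-contained model: emulation code
records the update cheaply by setting a flag, and a single commit point
resolves the flag before the state becomes visible to an outside
observer, which is what the hunk adds before vcpu_put(). Nothing below
is KVM code; the names are invented for illustration.
/* Toy model of deferred state updates resolved at a commit point. */
#include <stdio.h>

#define PENDING_INCREMENT_PC	(1u << 0)

struct toy_vcpu {
	unsigned long pc;
	unsigned int flags;
};

/* "Emulation" requests the update cheaply by setting a flag... */
static void toy_skip_instruction(struct toy_vcpu *v)
{
	v->flags |= PENDING_INCREMENT_PC;
}

/* ...and the flag is resolved once, at a well-defined commit point. */
static void toy_adjust_pc(struct toy_vcpu *v)
{
	if (v->flags & PENDING_INCREMENT_PC) {
		v->pc += 4;
		v->flags &= ~PENDING_INCREMENT_PC;
	}
}

int main(void)
{
	struct toy_vcpu v = { .pc = 0x1000, .flags = 0 };

	toy_skip_instruction(&v);
	/* Returning to an observer here without committing would expose
	 * the stale pc == 0x1000 - the disconnect the patch closes by
	 * committing before vcpu_put(). */
	toy_adjust_pc(&v);
	printf("pc = %#lx\n", v.pc);
	return 0;
}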
The Linux stable-rc 5.12 arm64 builds failed due to these warnings / errors.
make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/current ARCH=arm64
CROSS_COMPILE=aarch64-linux-gnu- 'HOSTCC=sccache clang' 'CC=sccache
clang'
arch/arm64/kvm/arm.c:722:8: error: use of undeclared label 'out'
goto out;
^
arch/arm64/kvm/arm.c:918:1: warning: unused label 'out' [-Wunused-label]
out:
^~~~
1 warning and 1 error generated.
Reported-by: Naresh Kamboju <naresh.kamboju(a)linaro.org>
step to reproduce:
------------------------
# TuxMake is a command line tool and Python library that provides
# portable and repeatable Linux kernel builds across a variety of
# architectures, toolchains, kernel configurations, and make targets.
#
# TuxMake supports the concept of runtimes.
# See https://docs.tuxmake.org/runtimes/, for that to work it requires
# that you install podman or docker on your system.
#
# To install tuxmake on your system globally:
# sudo pip3 install -U tuxmake
#
# See https://docs.tuxmake.org/ for complete documentation.
tuxmake --runtime podman --target-arch arm64 --toolchain gcc-9
--kconfig defconfig --kconfig-add
https://builds.tuxbuild.com/1tILLnkg7Rqf4tsdhjS3VoZDrdk/config
--
Linaro LKFT
https://lkft.linaro.org