The 4.4.y stable backport dc6ae4dffd65 of the upstream commit
3d4bf93ac120 ("tcp: detect malicious patterns in
tcp_collapse_ofo_queue()") missed the line that accumulates each
skb's truesize into range_truesize, which broke the check entirely.
Fixes: dc6ae4dffd65 ("tcp: detect malicious patterns in tcp_collapse_ofo_queue()")
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
---
Greg, this is a fix-up specific to the 4.4.y stable backport, which took a
slightly different form from the upstream fix. I haven't looked at the
older trees, but 4.9.y and later took the upstream fix as-is, so this
patch isn't needed for them.
The patch hasn't been tested with the real test case, though; let me
know if the current code is actually what was intended. Thanks!
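For reference, here is a minimal user-space sketch of why the missed
accumulation matters. This is not the kernel code; the struct name, the
queue size and the print-out are purely illustrative. Without the "+=",
range_truesize never grows past the first skb's truesize, so a flood of
tiny segments with huge per-skb overhead is never flagged for collapsing.
Build with: cc -o sketch sketch.c && ./sketch

/* sketch.c -- illustrative only, not net/ipv4/tcp_input.c */
#include <stdio.h>

struct fake_skb { unsigned int len; unsigned int truesize; };

/* Sum the truesize over one out-of-order range, optionally skipping the
 * accumulation that the 4.4.y backport lost. */
static unsigned int range_truesize_sum(const struct fake_skb *q, int n,
				       int accumulate)
{
	unsigned int range_truesize = q[0].truesize;
	int i;

	for (i = 1; i < n; i++)
		if (accumulate)
			range_truesize += q[i].truesize;
	return range_truesize;
}

int main(void)
{
	struct fake_skb queue[8];
	int i;

	for (i = 0; i < 8; i++) {
		queue[i].len = 1;		/* one byte of payload ... */
		queue[i].truesize = 768;	/* ... but large per-skb overhead */
	}

	/* With the accumulation the range is clearly bloated compared to the
	 * head skb alone; without it the sum always equals the head's
	 * truesize and the "malicious pattern" comparison can never fire. */
	printf("with    +=: range_truesize=%u, head truesize=%u\n",
	       range_truesize_sum(queue, 8, 1), queue[0].truesize);
	printf("without +=: range_truesize=%u, head truesize=%u\n",
	       range_truesize_sum(queue, 8, 0), queue[0].truesize);
	return 0;
}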
net/ipv4/tcp_input.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4a261e078082..9c4c6cd0316e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4835,6 +4835,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
end = TCP_SKB_CB(skb)->end_seq;
range_truesize = skb->truesize;
} else {
+ range_truesize += skb->truesize;
if (before(TCP_SKB_CB(skb)->seq, start))
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
--
2.18.0
From: Yannik Sembritzki <yannik(a)sembritzki.me>
The split of .system_keyring into .builtin_trusted_keys and
.secondary_trusted_keys broke kexec, thereby preventing kernels signed by
keys which are now in the secondary keyring from being kexec'd.
Fix this by passing VERIFY_USE_SECONDARY_KEYRING to
verify_pefile_signature().
Fixes: d3bfe84129f6 ("certs: Add a secondary system keyring that can be added to dynamically")
Signed-off-by: Yannik Sembritzki <yannik(a)sembritzki.me>
Signed-off-by: David Howells <dhowells(a)redhat.com>
cc: kexec(a)lists.infradead.org
cc: keyrings(a)vger.kernel.org
cc: linux-security-module(a)vger.kernel.org
cc: stable(a)vger.kernel.org
---
arch/x86/kernel/kexec-bzimage64.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 7326078eaa7a..278cd07228dd 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -532,7 +532,7 @@ static int bzImage64_cleanup(void *loader_data)
static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
{
return verify_pefile_signature(kernel, kernel_len,
- NULL,
+ VERIFY_USE_SECONDARY_KEYRING,
VERIFYING_KEXEC_PE_SIGNATURE);
}
#endif
Inside of start_xmit() the check whether the connection is up and the
queueing of packets for later transmission are not atomic, which
leaves a window where cm_rep_handler can run, set the connection up,
dequeue pending packets, and leave packets subsequently queued by
start_xmit() sitting on neigh->queue until they're dropped when the
connection is torn down. This only applies to connected mode. These
dropped packets can really upset TCP, for example, and cause
multi-minute delays in transmission for open connections.
I've got a reproducer available if it's needed.
Here's the code in start_xmit where we check to see if the connection
is up:
if (ipoib_cm_get(neigh)) {
if (ipoib_cm_up(neigh)) {
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
goto unref;
}
}
The race occurs if cm_rep_handler execution occurs after the above
connection check (specifically if it gets to the point where it acquires
priv->lock to dequeue pending skb's) but before the below code snippet
in start_xmit where packets are queued.
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
push_pseudo_header(skb, phdr->hwaddr);
spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
The patch re-checks ipoib_cm_up with priv->lock held to avoid this
race condition. Since odds are the connection will be up most of the
time (and thus *not* down most of the time), we don't hold the lock
for the first check, to avoid a slowdown from unnecessary locking for
the majority of the packets transmitted during the connection's life.
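For illustration only, here is a stand-alone pthread sketch of that
locking pattern: a cheap unlocked check first, then a re-check of the
same condition with the lock held before deciding to queue. All names
(conn_up, pending, xmit_one) are made up and are not part of the ipoib
code. Build with: cc -o recheck recheck.c -lpthread

/* recheck.c -- illustrative only */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int conn_up;	/* flipped by the "cm_rep_handler" side under the lock */
static int pending;	/* packets parked until the connection comes up */

static void xmit_one(void)
{
	if (conn_up) {			/* fast path: no lock in the common case */
		printf("send directly\n");
		return;
	}

	pthread_mutex_lock(&lock);
	if (conn_up) {
		/* Re-check under the lock: the handler may have raced in
		 * between and already drained the queue, so queueing now
		 * would strand this packet. */
		pthread_mutex_unlock(&lock);
		printf("send directly (won the race)\n");
	} else {
		pending++;		/* safe: the handler drains under this lock */
		pthread_mutex_unlock(&lock);
		printf("queued, %d pending\n", pending);
	}
}

int main(void)
{
	xmit_one();	/* connection down: packet gets queued */

	pthread_mutex_lock(&lock);
	conn_up = 1;	/* "cm_rep_handler": bring the connection up */
	pthread_mutex_unlock(&lock);

	xmit_one();	/* fast path sends directly now */
	return 0;
}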
Cc: stable(a)vger.kernel.org
Tested-by: Ira Weiny <ira.weiny(a)intel.com>
Signed-off-by: Aaron Knister <aaron.s.knister(a)nasa.gov>
---
drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
1 file changed, 44 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..529dbeab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
spin_unlock_irqrestore(&priv->lock, flags);
}
+static void defer_neigh_skb(struct sk_buff *skb, struct net_device *dev,
+ struct ipoib_neigh *neigh,
+ struct ipoib_pseudo_header *phdr,
+ unsigned long *flags)
+{
+ struct ipoib_dev_priv *priv = ipoib_priv(dev);
+ unsigned long local_flags;
+ int acquire_priv_lock = 0;
+
+ /* Passing in pointer to spin_lock flags indicates spin lock
+ * already acquired so we don't need to acquire the priv lock */
+ if (flags == NULL) {
+ flags = &local_flags;
+ acquire_priv_lock = 1;
+ }
+
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ push_pseudo_header(skb, phdr->hwaddr);
+ if (acquire_priv_lock)
+ spin_lock_irqsave(&priv->lock, *flags);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock_irqrestore(&priv->lock, *flags);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+}
+
static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
goto unref;
}
+ /*
+ * Re-check ipoib_cm_up with priv->lock held to avoid
+ * race condition between start_xmit and skb_dequeue in
+ * cm_rep_handler. Since odds are the conn should be up
+ * most of the time, we don't hold the lock for the
+ * first check above
+ */
+ spin_lock_irqsave(&priv->lock, flags);
+ if (ipoib_cm_up(neigh)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+ } else {
+ defer_neigh_skb(skb, dev, neigh, phdr, &flags);
+ }
+ goto unref;
} else if (neigh->ah && neigh->ah->valid) {
neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
IPOIB_QPN(phdr->hwaddr));
@@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
neigh_refresh_path(neigh, phdr->hwaddr, dev);
}
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
- push_pseudo_header(skb, phdr->hwaddr);
- spin_lock_irqsave(&priv->lock, flags);
- __skb_queue_tail(&neigh->queue, skb);
- spin_unlock_irqrestore(&priv->lock, flags);
- } else {
- ++dev->stats.tx_dropped;
- dev_kfree_skb_any(skb);
- }
+ defer_neigh_skb(skb, dev, neigh, phdr, NULL);
unref:
ipoib_neigh_put(neigh);
--
2.12.3
Hi!
According to 1), disabling EPT offers the same maximum protection against L1TF as disabling SMT, but
has a severe performance impact.
FWIW: With EPT disabled (2)), I can *not* confirm any performance degradation for the VirtualBox
Windows or Linux VMs that I use. Those VMs are for desktop use, though.
So to me it seems that the performance impact depends on the use case, and in a desktop setting
disabling EPT may offer a simple maximum-protection option with the advantage of still-enabled
hyperthreading.
I have tried this with 4.18.1 and 4.14.63.
Rainer Fiebig
***
1) https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html#mitigation-sel…
2) kvm-intel.ept=0
> tail /sys/devices/system/cpu/vulnerabilities/*
==> /sys/devices/system/cpu/vulnerabilities/l1tf <==
Mitigation: PTE Inversion; VMX: EPT disabled
==> /sys/devices/system/cpu/vulnerabilities/meltdown <==
Mitigation: PTI
==> /sys/devices/system/cpu/vulnerabilities/spec_store_bypass <==
Mitigation: Speculative Store Bypass disabled via prctl and seccomp
==> /sys/devices/system/cpu/vulnerabilities/spectre_v1 <==
Mitigation: __user pointer sanitization
==> /sys/devices/system/cpu/vulnerabilities/spectre_v2 <==
Mitigation: Full generic retpoline, IBPB, IBRS_FW
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5e0fb5df2ee871b841f96f9cb6a7f2784e96aa4e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani(a)hpe.com>
Date: Wed, 27 Jun 2018 08:13:48 -0600
Subject: [PATCH] x86/mm: Add TLB purge to free pmd/pte page interfaces
ioremap() calls pud_free_pmd_page() / pmd_free_pte_page() when it creates
a pud / pmd map. The following preconditions are met at their entry.
- All pte entries for a target pud/pmd address range have been cleared.
- System-wide TLB purges have been performed for a target pud/pmd address
range.
The preconditions assure that there is no stale TLB entry for the range.
Speculation may not cache TLB entries since it requires all levels of page
entries, including ptes, to have P & A-bits set for an associated address.
However, speculation may cache pud/pmd entries (paging-structure caches)
when they have P-bit set.
Add a system-wide TLB purge (INVLPG) to a single page after clearing
pud/pmd entry's P-bit.
SDM 4.10.4.1, Operations that Invalidate TLBs and Paging-Structure Caches,
states that:
INVLPG invalidates all paging-structure caches associated with the
current PCID regardless of the linear addresses to which they correspond.
Fixes: 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces")
Signed-off-by: Toshi Kani <toshi.kani(a)hpe.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: mhocko(a)suse.com
Cc: akpm(a)linux-foundation.org
Cc: hpa(a)zytor.com
Cc: cpandya(a)codeaurora.org
Cc: linux-mm(a)kvack.org
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: stable(a)vger.kernel.org
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20180627141348.21777-4-toshi.kani@hpe.com
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fbd14e506758..e3deefb891da 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -725,24 +725,44 @@ int pmd_clear_huge(pmd_t *pmd)
* @pud: Pointer to a PUD.
* @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i], addr + (i * PMD_SIZE)))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -753,7 +773,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
* @pmd: Pointer to a PMD.
* @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
@@ -765,6 +785,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5e0fb5df2ee871b841f96f9cb6a7f2784e96aa4e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani(a)hpe.com>
Date: Wed, 27 Jun 2018 08:13:48 -0600
Subject: [PATCH] x86/mm: Add TLB purge to free pmd/pte page interfaces
ioremap() calls pud_free_pmd_page() / pmd_free_pte_page() when it creates
a pud / pmd map. The following preconditions are met at their entry.
- All pte entries for a target pud/pmd address range have been cleared.
- System-wide TLB purges have been performed for a target pud/pmd address
range.
The preconditions assure that there is no stale TLB entry for the range.
Speculation may not cache TLB entries since it requires all levels of page
entries, including ptes, to have P & A-bits set for an associated address.
However, speculation may cache pud/pmd entries (paging-structure caches)
when they have P-bit set.
Add a system-wide TLB purge (INVLPG) to a single page after clearing
pud/pmd entry's P-bit.
SDM 4.10.4.1, Operations that Invalidate TLBs and Paging-Structure Caches,
states that:
INVLPG invalidates all paging-structure caches associated with the
current PCID regardless of the linear addresses to which they correspond.
Fixes: 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces")
Signed-off-by: Toshi Kani <toshi.kani(a)hpe.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: mhocko(a)suse.com
Cc: akpm(a)linux-foundation.org
Cc: hpa(a)zytor.com
Cc: cpandya(a)codeaurora.org
Cc: linux-mm(a)kvack.org
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: stable(a)vger.kernel.org
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20180627141348.21777-4-toshi.kani@hpe.com
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fbd14e506758..e3deefb891da 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -725,24 +725,44 @@ int pmd_clear_huge(pmd_t *pmd)
* @pud: Pointer to a PUD.
* @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i], addr + (i * PMD_SIZE)))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -753,7 +773,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
* @pmd: Pointer to a PMD.
* @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
@@ -765,6 +785,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
The patch below does not apply to the 4.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5e0fb5df2ee871b841f96f9cb6a7f2784e96aa4e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani(a)hpe.com>
Date: Wed, 27 Jun 2018 08:13:48 -0600
Subject: [PATCH] x86/mm: Add TLB purge to free pmd/pte page interfaces
ioremap() calls pud_free_pmd_page() / pmd_free_pte_page() when it creates
a pud / pmd map. The following preconditions are met at their entry.
- All pte entries for a target pud/pmd address range have been cleared.
- System-wide TLB purges have been performed for a target pud/pmd address
range.
The preconditions assure that there is no stale TLB entry for the range.
Speculation may not cache TLB entries since it requires all levels of page
entries, including ptes, to have P & A-bits set for an associated address.
However, speculation may cache pud/pmd entries (paging-structure caches)
when they have P-bit set.
Add a system-wide TLB purge (INVLPG) to a single page after clearing
pud/pmd entry's P-bit.
SDM 4.10.4.1, Operations that Invalidate TLBs and Paging-Structure Caches,
states that:
INVLPG invalidates all paging-structure caches associated with the
current PCID regardless of the linear addresses to which they correspond.
Fixes: 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces")
Signed-off-by: Toshi Kani <toshi.kani(a)hpe.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: mhocko(a)suse.com
Cc: akpm(a)linux-foundation.org
Cc: hpa(a)zytor.com
Cc: cpandya(a)codeaurora.org
Cc: linux-mm(a)kvack.org
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: stable(a)vger.kernel.org
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20180627141348.21777-4-toshi.kani@hpe.com
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fbd14e506758..e3deefb891da 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -725,24 +725,44 @@ int pmd_clear_huge(pmd_t *pmd)
* @pud: Pointer to a PUD.
* @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i], addr + (i * PMD_SIZE)))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -753,7 +773,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
* @pmd: Pointer to a PMD.
* @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
@@ -765,6 +785,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
The patch below does not apply to the 4.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5e0fb5df2ee871b841f96f9cb6a7f2784e96aa4e Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani(a)hpe.com>
Date: Wed, 27 Jun 2018 08:13:48 -0600
Subject: [PATCH] x86/mm: Add TLB purge to free pmd/pte page interfaces
ioremap() calls pud_free_pmd_page() / pmd_free_pte_page() when it creates
a pud / pmd map. The following preconditions are met at their entry.
- All pte entries for a target pud/pmd address range have been cleared.
- System-wide TLB purges have been performed for a target pud/pmd address
range.
The preconditions assure that there is no stale TLB entry for the range.
Speculation may not cache TLB entries since it requires all levels of page
entries, including ptes, to have P & A-bits set for an associated address.
However, speculation may cache pud/pmd entries (paging-structure caches)
when they have P-bit set.
Add a system-wide TLB purge (INVLPG) to a single page after clearing
pud/pmd entry's P-bit.
SDM 4.10.4.1, Operations that Invalidate TLBs and Paging-Structure Caches,
states that:
INVLPG invalidates all paging-structure caches associated with the
current PCID regardless of the linear addresses to which they correspond.
Fixes: 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces")
Signed-off-by: Toshi Kani <toshi.kani(a)hpe.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: mhocko(a)suse.com
Cc: akpm(a)linux-foundation.org
Cc: hpa(a)zytor.com
Cc: cpandya(a)codeaurora.org
Cc: linux-mm(a)kvack.org
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: stable(a)vger.kernel.org
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20180627141348.21777-4-toshi.kani@hpe.com
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fbd14e506758..e3deefb891da 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -725,24 +725,44 @@ int pmd_clear_huge(pmd_t *pmd)
* @pud: Pointer to a PUD.
* @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i], addr + (i * PMD_SIZE)))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -753,7 +773,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
* @pmd: Pointer to a PMD.
* @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
@@ -765,6 +785,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
The patch below does not apply to the 4.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c40a56a7818cfe735fc93a69e1875f8bba834483 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen(a)linux.intel.com>
Date: Thu, 2 Aug 2018 15:58:31 -0700
Subject: [PATCH] x86/mm/init: Remove freed kernel image areas from alias
mapping
The kernel image is mapped into two places in the virtual address space
(addresses without KASLR, of course):
1. The kernel direct map (0xffff880000000000)
2. The "high kernel map" (0xffffffff81000000)
We actually execute out of #2. If we get the address of a kernel symbol,
it points to #2, but almost all physical-to-virtual translations point to #1.
Parts of the "high kernel map" alias are mapped in the userspace page
tables with the Global bit for performance reasons. The parts that we map
to userspace do not (er, should not) have secrets. When PTI is enabled,
the Global bit is usually not set in the high mapping and is just used to
compensate for poor performance on systems which lack PCID.
This is fine, except that some areas in the kernel image that are adjacent
to the non-secret-containing areas are unused holes. We free these holes
back into the normal page allocator and reuse them as normal kernel memory.
The memory will, of course, get *used* via the normal map, but the alias
mapping is kept.
This otherwise unused alias mapping of the holes will, by default, keep the
Global bit, be mapped out to userspace, and be vulnerable to Meltdown.
Remove the alias mapping of these pages entirely. This is likely to
fracture the 2M page mapping the kernel image near these areas, but this
should affect a minority of the area.
The pageattr code changes *all* aliases mapping the physical pages that it
operates on (by default). We only want to modify a single alias, so we
need to tweak its behavior.
This unmapping behavior is currently dependent on PTI being in place.
Going forward, we should at least consider doing this for all
configurations. Having an extra read-write alias for memory is not exactly
ideal for debugging things like random memory corruption and this does
undercut features like DEBUG_PAGEALLOC or future work like eXclusive Page
Frame Ownership (XPFO).
Before this patch:
current_kernel:---[ High Kernel Mapping ]---
current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd
current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd
current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte
current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte
current_kernel-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd
current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE NX pmd
current_kernel-0xffffffff82c00000-0xffffffff82e00000 2M RW NX pte
current_kernel-0xffffffff82e00000-0xffffffff83200000 4M RW PSE NX pmd
current_kernel-0xffffffff83200000-0xffffffffa0000000 462M pmd
current_user:---[ High Kernel Mapping ]---
current_user-0xffffffff80000000-0xffffffff81000000 16M pmd
current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd
current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte
current_user-0xffffffff81e11000-0xffffffff82000000 1980K RW NX pte
current_user-0xffffffff82000000-0xffffffff82600000 6M ro PSE GLB NX pmd
current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd
After this patch:
current_kernel:---[ High Kernel Mapping ]---
current_kernel-0xffffffff80000000-0xffffffff81000000 16M pmd
current_kernel-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd
current_kernel-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte
current_kernel-0xffffffff81e11000-0xffffffff82000000 1980K pte
current_kernel-0xffffffff82000000-0xffffffff82400000 4M ro PSE GLB NX pmd
current_kernel-0xffffffff82400000-0xffffffff82488000 544K ro NX pte
current_kernel-0xffffffff82488000-0xffffffff82600000 1504K pte
current_kernel-0xffffffff82600000-0xffffffff82c00000 6M RW PSE NX pmd
current_kernel-0xffffffff82c00000-0xffffffff82c0d000 52K RW NX pte
current_kernel-0xffffffff82c0d000-0xffffffff82dc0000 1740K pte
current_user:---[ High Kernel Mapping ]---
current_user-0xffffffff80000000-0xffffffff81000000 16M pmd
current_user-0xffffffff81000000-0xffffffff81e00000 14M ro PSE GLB x pmd
current_user-0xffffffff81e00000-0xffffffff81e11000 68K ro GLB x pte
current_user-0xffffffff81e11000-0xffffffff82000000 1980K pte
current_user-0xffffffff82000000-0xffffffff82400000 4M ro PSE GLB NX pmd
current_user-0xffffffff82400000-0xffffffff82488000 544K ro NX pte
current_user-0xffffffff82488000-0xffffffff82600000 1504K pte
current_user-0xffffffff82600000-0xffffffffa0000000 474M pmd
[ tglx: Do not unmap on 32bit as there is only one mapping ]
Fixes: 0f561fce4d69 ("x86/pti: Enable global pages for shared areas")
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Kees Cook <keescook(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Joerg Roedel <jroedel(a)suse.de>
Link: https://lkml.kernel.org/r/20180802225831.5F6A2BFC@viggo.jf.intel.com
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index bd090367236c..34cffcef7375 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bc11dedffc45..74b157ac078d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -780,8 +780,30 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
*/
void free_kernel_image_pages(void *begin, void *end)
{
- free_init_pages("unused kernel image",
- (unsigned long)begin, (unsigned long)end);
+ unsigned long begin_ul = (unsigned long)begin;
+ unsigned long end_ul = (unsigned long)end;
+ unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+
+ free_init_pages("unused kernel image", begin_ul, end_ul);
+
+ /*
+ * PTI maps some of the kernel into userspace. For performance,
+ * this includes some kernel areas that do not contain secrets.
+ * Those areas might be adjacent to the parts of the kernel image
+ * being freed, which may contain secrets. Remove the "high kernel
+ * image mapping" for these freed areas, ensuring they are not even
+ * potentially vulnerable to Meltdown regardless of the specific
+ * optimizations PTI is currently using.
+ *
+ * The "noalias" prevents unmapping the direct map alias which is
+ * needed to access the freed pages.
+ *
+ * This is only valid for 64bit kernels. 32bit has only one mapping
+ * which can't be treated in this way for obvious reasons.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+ set_memory_np_noalias(begin_ul, len_pages);
}
void __ref free_initmem(void)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c04153796f61..0a74996a1149 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+ /* Has caller explicitly disabled alias checking? */
+ if (in_flag & CPA_NO_CHECK_ALIAS)
+ checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+ int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(_PAGE_PRESENT), 0,
+ cpa_flags, NULL);
+}
+
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
Hi!
With 4.18.1 and l1tf=full, no problems so far and no noticeable performance degradation in normal
desktop usage, including a VirtualBox VM. Compiling a kernel seems to take a bit longer, though.
4.9.120 and 4.14.63 also seem OK in cursory testing.
CPU: Core i3.
Thanks and so long!
Rainer Fiebig
--
The truth always turns out to be simpler than you thought.
Richard Feynman
Dear Supplier,
Please view our company catalog in the attached file below, select
items, and then provide a quotation based on our requirements.
For further clarification or any questions, feel free to contact
me.
Your reply will be very much appreciated and helpful
for our next conversation.
We hope we can establish a long-term business relationship.
Best regards.
Walter Benson.
Marketing & Business Development
APTECH AFRICA LTD
JUBA SOUTH SUDAN
TEL:+211922414561
www.aptechafrica.com
This is the start of the stable review cycle for the 4.9.120 release.
There are 107 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu Aug 16 17:14:53 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.120-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.120-rc1
Josh Poimboeuf <jpoimboe(a)redhat.com>
x86/microcode: Allow late microcode loading with SMT disabled
Ashok Raj <ashok.raj(a)intel.com>
x86/microcode: Do not upload microcode if CPUs are offline
David Woodhouse <dwmw(a)amazon.co.uk>
tools headers: Synchronise x86 cpufeatures.h for L1TF additions
Andi Kleen <ak(a)linux.intel.com>
x86/mm/kmmio: Make the tracer robust against L1TF
Andi Kleen <ak(a)linux.intel.com>
x86/mm/pat: Make set_memory_np() L1TF safe
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Invert all not present mappings
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Fix SMT supported evaluation
Paolo Bonzini <pbonzini(a)redhat.com>
KVM: VMX: Tell the nested hypervisor to skip L1D flush on vmentry
Paolo Bonzini <pbonzini(a)redhat.com>
x86/speculation: Use ARCH_CAPABILITIES to skip L1D flush on vmentry
Paolo Bonzini <pbonzini(a)redhat.com>
x86/speculation: Simplify sysfs report of VMX L1TF vulnerability
Paolo Bonzini <pbonzini(a)redhat.com>
KVM: VMX: support MSR_IA32_ARCH_CAPABILITIES as a feature MSR
Wanpeng Li <wanpengli(a)tencent.com>
KVM: X86: Allow userspace to define the microcode version
Wanpeng Li <wanpengli(a)tencent.com>
KVM: X86: Introduce kvm_get_msr_feature()
Tom Lendacky <thomas.lendacky(a)amd.com>
KVM: SVM: Add MSR-based feature support for serializing LFENCE
Tom Lendacky <thomas.lendacky(a)amd.com>
KVM: x86: Add a framework for supporting MSR-based features
Thomas Gleixner <tglx(a)linutronix.de>
Documentation/l1tf: Remove Yonah processors from not vulnerable list
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Don't set l1tf_flush_l1d from vmx_handle_external_intr()
Nicolai Stange <nstange(a)suse.de>
x86/irq: Let interrupt handlers set kvm_cpu_l1tf_flush_l1d
Nicolai Stange <nstange(a)suse.de>
x86: Don't include linux/irq.h from asm/hardirq.h
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Introduce per-host-cpu analogue of l1tf_flush_l1d
Nicolai Stange <nstange(a)suse.de>
x86/irq: Demote irq_cpustat_t::__softirq_pending to u16
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Move the l1tf_flush_l1d test to vmx_l1d_flush()
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Replace 'vmx_l1d_flush_always' with 'vmx_l1d_flush_cond'
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Don't set l1tf_flush_l1d to true from vmx_l1d_flush()
Josh Poimboeuf <jpoimboe(a)redhat.com>
cpu/hotplug: detect SMT disabled by BIOS
Tony Luck <tony.luck(a)intel.com>
Documentation/l1tf: Fix typos
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Initialize the vmx_l1d_flush_pages' content
Thomas Gleixner <tglx(a)linutronix.de>
Documentation: Add section about CPU vulnerabilities
Jiri Kosina <jkosina(a)suse.cz>
x86/bugs, kvm: Introduce boot-time control of L1TF mitigations
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early
Jiri Kosina <jkosina(a)suse.cz>
cpu/hotplug: Expose SMT control init function
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Allow runtime control of L1D flush
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Serialize L1D flush parameter setter
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Add static key for flush always
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Move l1tf setup function
Thomas Gleixner <tglx(a)linutronix.de>
x86/l1tf: Handle EPT disabled state proper
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Drop L1TF MSR list approach
Thomas Gleixner <tglx(a)linutronix.de>
x86/litf: Introduce vmx status variable
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Online siblings when SMT control is turned on
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Separate the VMX AUTOLOAD guest/host number accounting
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Add find_msr() helper function
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Split the VMX MSR LOAD structures to have an host/guest numbers
Jim Mattson <jmattson(a)google.com>
kvm: nVMX: Update MSR load counts on a VMCS switch
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D flush logic
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D MSR based flush
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D flush algorithm
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Add module argument for L1TF mitigation
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM: Warn user if KVM is loaded SMT and L1TF CPU bug being present
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Boot HT siblings at least once
Thomas Gleixner <tglx(a)linutronix.de>
Revert "x86/apic: Ignore secondary threads if nosmt=force"
Michal Hocko <mhocko(a)suse.cz>
x86/speculation/l1tf: Fix up pte->pfn conversion for PAE
Vlastimil Babka <vbabka(a)suse.cz>
x86/speculation/l1tf: Protect PAE swap entries against L1TF
Borislav Petkov <bp(a)suse.de>
x86/CPU/AMD: Move TOPOEXT reenablement before reading smp_num_siblings
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/cpufeatures: Add detection of L1D cache flush support.
Vlastimil Babka <vbabka(a)suse.cz>
x86/speculation/l1tf: Extend 64bit swap file size limit
Thomas Gleixner <tglx(a)linutronix.de>
x86/apic: Ignore secondary threads if nosmt=force
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/AMD: Evaluate smp_num_siblings early
Borislav Petkov <bp(a)suse.de>
x86/CPU/AMD: Do not check CPUID max ext level before parsing SMP info
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/intel: Evaluate smp_num_siblings early
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/topology: Provide detect_extended_topology_early()
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/common: Provide detect_ht_early()
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/AMD: Remove the pointless detect_ht() call
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu: Remove the pointless CPU printout
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Provide knobs to control SMT
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Split do_cpu_down()
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Make bringup/teardown of smp threads symmetric
Thomas Gleixner <tglx(a)linutronix.de>
x86/topology: Provide topology_smt_supported()
Thomas Gleixner <tglx(a)linutronix.de>
x86/smp: Provide topology_is_primary_thread()
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/bugs: Move the l1tf function and define pr_fmt properly
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Limit swap file size to MAX_PA/2
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Add sysfs reporting for l1tf
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Make sure the first page is always reserved
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
Linus Torvalds <torvalds(a)linux-foundation.org>
x86/speculation/l1tf: Protect swap entries against L1TF
Linus Torvalds <torvalds(a)linux-foundation.org>
x86/speculation/l1tf: Change order of offset/type in swap entry
Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
mm: x86: move _PAGE_SWP_SOFT_DIRTY from bit 7 to bit 1
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT
Nick Desaulniers <ndesaulniers(a)google.com>
x86/irqflags: Provide a declaration for native_save_fl
Masami Hiramatsu <mhiramat(a)kernel.org>
kprobes/x86: Fix %p uses in error messages
Jiri Kosina <jkosina(a)suse.cz>
x86/speculation: Protect against userspace-userspace spectreRSB
Peter Zijlstra <peterz(a)infradead.org>
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
Oleksij Rempel <o.rempel(a)pengutronix.de>
ARM: dts: imx6sx: fix irq for pcie bridge
Michael Mera <dev(a)michaelmera.com>
IB/ocrdma: fix out of bounds access to local buffer
Fabio Estevam <fabio.estevam(a)nxp.com>
mtd: nand: qcom: Add a NULL check for devm_kasprintf()
Jack Morgenstein <jackm(a)dev.mellanox.co.il>
IB/mlx4: Mark user MR as writable if actual virtual memory is writable
Jack Morgenstein <jackm(a)dev.mellanox.co.il>
IB/core: Make testing MR flags for writability a static inline function
Eric W. Biederman <ebiederm(a)xmission.com>
proc: Fix proc_sys_prune_dcache to hold a sb reference
Eric W. Biederman <ebiederm(a)xmission.com>
proc/sysctl: Don't grab i_lock under sysctl_lock.
Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru>
proc/sysctl: prune stale dentries during unregistering
Al Viro <viro(a)zeniv.linux.org.uk>
fix __legitimize_mnt()/mntput() race
Al Viro <viro(a)zeniv.linux.org.uk>
fix mntput/mntput race
Al Viro <viro(a)zeniv.linux.org.uk>
make sure that __dentry_kill() always invalidates d_seq, unhashed or not
Al Viro <viro(a)zeniv.linux.org.uk>
root dentries need RCU-delayed freeing
Linus Torvalds <torvalds(a)linux-foundation.org>
init: rename and re-order boot_cpu_state_init()
Bart Van Assche <bart.vanassche(a)wdc.com>
scsi: sr: Avoid that opening a CD-ROM hangs with runtime power management enabled
Hans de Goede <hdegoede(a)redhat.com>
ACPI / LPSS: Add missing prv_offset setting for byt/cht PWM devices
Juergen Gross <jgross(a)suse.com>
xen/netfront: don't cache skb_shinfo()
Linus Torvalds <torvalds(a)linux-foundation.org>
Mark HI and TASKLET softirq synchronous
Andrey Konovalov <andreyknvl(a)google.com>
kasan: add no_sanitize attribute for clang builds
John David Anglin <dave.anglin(a)bell.net>
parisc: Define mb() and add memory barriers to assembler unlock sequences
Helge Deller <deller(a)gmx.de>
parisc: Enable CONFIG_MLONGCALLS by default
Tadeusz Struk <tadeusz.struk(a)intel.com>
tpm: fix race condition in tpm_common_write()
Theodore Ts'o <tytso(a)mit.edu>
ext4: fix check to prevent initializing reserved inodes
-------------
Diffstat:
Documentation/ABI/testing/sysfs-devices-system-cpu | 24 +
Documentation/index.rst | 1 +
Documentation/kernel-parameters.txt | 78 +++
Documentation/l1tf.rst | 610 +++++++++++++++++++++
Documentation/virtual/kvm/api.txt | 40 +-
Makefile | 4 +-
arch/Kconfig | 3 +
arch/arm/boot/dts/imx6sx.dtsi | 2 +-
arch/parisc/Kconfig | 2 +-
arch/parisc/include/asm/barrier.h | 32 ++
arch/parisc/kernel/entry.S | 2 +
arch/parisc/kernel/pacache.S | 1 +
arch/parisc/kernel/syscall.S | 4 +
arch/x86/Kconfig | 1 +
arch/x86/include/asm/apic.h | 10 +
arch/x86/include/asm/cpufeatures.h | 4 +-
arch/x86/include/asm/dmi.h | 2 +-
arch/x86/include/asm/hardirq.h | 26 +-
arch/x86/include/asm/irqflags.h | 2 +
arch/x86/include/asm/kvm_host.h | 9 +
arch/x86/include/asm/msr-index.h | 7 +
arch/x86/include/asm/page_32_types.h | 9 +-
arch/x86/include/asm/pgtable-2level.h | 17 +
arch/x86/include/asm/pgtable-3level.h | 37 +-
arch/x86/include/asm/pgtable-invert.h | 32 ++
arch/x86/include/asm/pgtable.h | 85 ++-
arch/x86/include/asm/pgtable_64.h | 48 +-
arch/x86/include/asm/pgtable_types.h | 10 +-
arch/x86/include/asm/processor.h | 17 +
arch/x86/include/asm/topology.h | 6 +-
arch/x86/include/asm/vmx.h | 11 +
arch/x86/kernel/apic/apic.c | 17 +
arch/x86/kernel/apic/htirq.c | 2 +
arch/x86/kernel/apic/io_apic.c | 1 +
arch/x86/kernel/apic/msi.c | 1 +
arch/x86/kernel/apic/vector.c | 1 +
arch/x86/kernel/cpu/amd.c | 53 +-
arch/x86/kernel/cpu/bugs.c | 171 ++++--
arch/x86/kernel/cpu/common.c | 56 +-
arch/x86/kernel/cpu/cpu.h | 2 +
arch/x86/kernel/cpu/intel.c | 7 +
arch/x86/kernel/cpu/microcode/core.c | 26 +
arch/x86/kernel/cpu/topology.c | 41 +-
arch/x86/kernel/fpu/core.c | 1 +
arch/x86/kernel/ftrace.c | 1 +
arch/x86/kernel/hpet.c | 1 +
arch/x86/kernel/i8259.c | 1 +
arch/x86/kernel/irq.c | 1 +
arch/x86/kernel/irq_32.c | 1 +
arch/x86/kernel/irq_64.c | 1 +
arch/x86/kernel/irqinit.c | 1 +
arch/x86/kernel/kprobes/core.c | 5 +-
arch/x86/kernel/kprobes/opt.c | 1 +
arch/x86/kernel/paravirt.c | 14 +-
arch/x86/kernel/setup.c | 6 +
arch/x86/kernel/smp.c | 1 +
arch/x86/kernel/smpboot.c | 18 +
arch/x86/kernel/time.c | 1 +
arch/x86/kvm/svm.c | 46 +-
arch/x86/kvm/vmx.c | 426 ++++++++++++--
arch/x86/kvm/x86.c | 133 ++++-
arch/x86/mm/fault.c | 1 +
arch/x86/mm/init.c | 23 +
arch/x86/mm/kaiser.c | 1 +
arch/x86/mm/kmmio.c | 25 +-
arch/x86/mm/mmap.c | 21 +
arch/x86/mm/pageattr.c | 8 +-
arch/x86/platform/efi/efi_64.c | 1 +
arch/x86/platform/efi/quirks.c | 1 +
.../intel-mid/device_libs/platform_mrfld_wdt.c | 1 +
arch/x86/platform/uv/tlb_uv.c | 1 +
arch/x86/xen/enlighten.c | 1 +
arch/x86/xen/setup.c | 1 +
drivers/acpi/acpi_lpss.c | 2 +
drivers/base/cpu.c | 8 +
drivers/char/tpm/tpm-dev.c | 43 +-
drivers/infiniband/core/umem.c | 11 +-
drivers/infiniband/hw/mlx4/mr.c | 50 +-
drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 2 +-
drivers/mtd/nand/qcom_nandc.c | 3 +
drivers/net/xen-netfront.c | 8 +-
drivers/pci/host/pci-hyperv.c | 2 +
drivers/scsi/sr.c | 29 +-
fs/dcache.c | 13 +-
fs/ext4/ialloc.c | 5 +-
fs/ext4/super.c | 8 +-
fs/namespace.c | 28 +-
fs/proc/inode.c | 3 +-
fs/proc/internal.h | 7 +-
fs/proc/proc_sysctl.c | 83 ++-
include/asm-generic/pgtable.h | 12 +
include/linux/compiler-clang.h | 3 +
include/linux/cpu.h | 23 +-
include/linux/swapfile.h | 2 +
include/linux/sysctl.h | 1 +
include/rdma/ib_verbs.h | 14 +
include/uapi/linux/kvm.h | 2 +
init/main.c | 2 +-
kernel/cpu.c | 282 +++++++++-
kernel/smp.c | 2 +
kernel/softirq.c | 12 +-
mm/memory.c | 29 +-
mm/mprotect.c | 49 ++
mm/swapfile.c | 46 +-
tools/arch/x86/include/asm/cpufeatures.h | 4 +-
105 files changed, 2680 insertions(+), 366 deletions(-)
From: Aaron Knister <aaron.s.knister(a)nasa.gov>
Inside of start_xmit() the check whether the connection is up and the
queueing of packets for later transmission are not atomic, which
leaves a window where cm_rep_handler can run, set the connection up,
dequeue pending packets, and leave packets subsequently queued by
start_xmit() sitting on neigh->queue until they're dropped when the
connection is torn down. This only applies to connected mode. These
dropped packets can really upset TCP, for example, and cause
multi-minute delays in transmission for open connections.
I've got a reproducer available if it's needed.
Here's the code in start_xmit where we check to see if the connection
is up:
if (ipoib_cm_get(neigh)) {
if (ipoib_cm_up(neigh)) {
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
goto unref;
}
}
The race occurs if cm_rep_handler execution occurs after the above
connection check (specifically if it gets to the point where it acquires
priv->lock to dequeue pending skb's) but before the below code snippet
in start_xmit where packets are queued.
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
push_pseudo_header(skb, phdr->hwaddr);
spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
The patch re-checks ipoib_cm_up with priv->lock held to avoid this
race condition. Since odds are the connection will be up most of the
time (and thus *not* down most of the time), we don't hold the lock
for the first check, to avoid a slowdown from unnecessary locking for
the majority of the packets transmitted during the connection's life.
Cc: stable(a)vger.kernel.org
Tested-by: Ira Weiny <ira.weiny(a)intel.com>
Signed-off-by: Aaron Knister <aaron.s.knister(a)nasa.gov>
---
drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
1 file changed, 44 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..529dbeab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
spin_unlock_irqrestore(&priv->lock, flags);
}
+static void defer_neigh_skb(struct sk_buff *skb, struct net_device *dev,
+ struct ipoib_neigh *neigh,
+ struct ipoib_pseudo_header *phdr,
+ unsigned long *flags)
+{
+ struct ipoib_dev_priv *priv = ipoib_priv(dev);
+ unsigned long local_flags;
+ int acquire_priv_lock = 0;
+
+ /* Passing in pointer to spin_lock flags indicates spin lock
+ * already acquired so we don't need to acquire the priv lock */
+ if (flags == NULL) {
+ flags = &local_flags;
+ acquire_priv_lock = 1;
+ }
+
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ push_pseudo_header(skb, phdr->hwaddr);
+ if (acquire_priv_lock)
+ spin_lock_irqsave(&priv->lock, *flags);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock_irqrestore(&priv->lock, *flags);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+}
+
static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
goto unref;
}
+ /*
+ * Re-check ipoib_cm_up with priv->lock held to avoid
+ * race condition between start_xmit and skb_dequeue in
+ * cm_rep_handler. Since odds are the conn should be up
+ * most of the time, we don't hold the lock for the
+ * first check above
+ */
+ spin_lock_irqsave(&priv->lock, flags);
+ if (ipoib_cm_up(neigh)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+ } else {
+ defer_neigh_skb(skb, dev, neigh, phdr, &flags);
+ }
+ goto unref;
} else if (neigh->ah && neigh->ah->valid) {
neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
IPOIB_QPN(phdr->hwaddr));
@@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
neigh_refresh_path(neigh, phdr->hwaddr, dev);
}
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
- push_pseudo_header(skb, phdr->hwaddr);
- spin_lock_irqsave(&priv->lock, flags);
- __skb_queue_tail(&neigh->queue, skb);
- spin_unlock_irqrestore(&priv->lock, flags);
- } else {
- ++dev->stats.tx_dropped;
- dev_kfree_skb_any(skb);
- }
+ defer_neigh_skb(skb, dev, neigh, phdr, NULL);
unref:
ipoib_neigh_put(neigh);
--
2.12.3
From: Andi Kleen <ak(a)linux.intel.com>
The stable backport of the
"x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings"
patch for 4.4 and 4.9 put the new C code for the
!__HAVE_ARCH_PFN_MODIFY_ALLOWED case outside the assembler ifdef. This
breaks the xtensa and ia64 builds, as reported by 0day, which somehow
include this file from assembler code.
Just add an #ifndef __ASSEMBLY__ guard around the new code to fix this.
This patch is only needed for 4.9 and 4.4 stable; the newer stable
trees don't have this problem.
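As a generic illustration (this is not the actual asm-generic/pgtable.h,
just a made-up example header), the pattern is that anything the
assembler cannot parse -- struct declarations, prototypes, inline
functions -- must sit inside the !__ASSEMBLY__ section, while plain
#define constants may stay visible to .S files:

/* example.h -- illustrative header shared by C and assembly */
#ifndef _EXAMPLE_H
#define _EXAMPLE_H

#define EXAMPLE_FLAG	0x1		/* usable from both C and .S files */

#ifndef __ASSEMBLY__
struct file;				/* C only: the assembler would choke here */

static inline int example_check(void)	/* C only: inline function */
{
	return 0;
}
#endif	/* __ASSEMBLY__ */

#endif	/* _EXAMPLE_H */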
Fixes: 7c5b42f82c13 ("x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings")
Signed-off-by: Andi Kleen <ak(a)linux.intel.com>
---
include/asm-generic/pgtable.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a88ea9e37a25..abc2a1b15dd8 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -825,6 +825,7 @@ static inline int pmd_free_pte_page(pmd_t *pmd)
#endif
#endif
+#ifndef __ASSEMBLY__
struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot);
@@ -839,6 +840,9 @@ static inline bool arch_has_pfn_modify_check(void)
{
return false;
}
+
+#endif
+
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
#endif /* !__ASSEMBLY__ */
--
2.17.1
This is the start of the stable review cycle for the 4.18.1 release.
There are 79 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu Aug 16 17:13:16 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.18.1-rc1…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.18.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.18.1-rc1
Josh Poimboeuf <jpoimboe(a)redhat.com>
x86/microcode: Allow late microcode loading with SMT disabled
David Woodhouse <dwmw(a)amazon.co.uk>
tools headers: Synchronise x86 cpufeatures.h for L1TF additions
Andi Kleen <ak(a)linux.intel.com>
x86/mm/kmmio: Make the tracer robust against L1TF
Andi Kleen <ak(a)linux.intel.com>
x86/mm/pat: Make set_memory_np() L1TF safe
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Invert all not present mappings
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Fix SMT supported evaluation
Paolo Bonzini <pbonzini(a)redhat.com>
KVM: VMX: Tell the nested hypervisor to skip L1D flush on vmentry
Paolo Bonzini <pbonzini(a)redhat.com>
x86/speculation: Use ARCH_CAPABILITIES to skip L1D flush on vmentry
Paolo Bonzini <pbonzini(a)redhat.com>
x86/speculation: Simplify sysfs report of VMX L1TF vulnerability
Thomas Gleixner <tglx(a)linutronix.de>
Documentation/l1tf: Remove Yonah processors from not vulnerable list
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Don't set l1tf_flush_l1d from vmx_handle_external_intr()
Nicolai Stange <nstange(a)suse.de>
x86/irq: Let interrupt handlers set kvm_cpu_l1tf_flush_l1d
Nicolai Stange <nstange(a)suse.de>
x86: Don't include linux/irq.h from asm/hardirq.h
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Introduce per-host-cpu analogue of l1tf_flush_l1d
Nicolai Stange <nstange(a)suse.de>
x86/irq: Demote irq_cpustat_t::__softirq_pending to u16
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Move the l1tf_flush_l1d test to vmx_l1d_flush()
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Replace 'vmx_l1d_flush_always' with 'vmx_l1d_flush_cond'
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Don't set l1tf_flush_l1d to true from vmx_l1d_flush()
Josh Poimboeuf <jpoimboe(a)redhat.com>
cpu/hotplug: detect SMT disabled by BIOS
Tony Luck <tony.luck(a)intel.com>
Documentation/l1tf: Fix typos
Nicolai Stange <nstange(a)suse.de>
x86/KVM/VMX: Initialize the vmx_l1d_flush_pages' content
Jiri Kosina <jkosina(a)suse.cz>
x86/speculation/l1tf: Unbreak !__HAVE_ARCH_PFN_MODIFY_ALLOWED architectures
Thomas Gleixner <tglx(a)linutronix.de>
Documentation: Add section about CPU vulnerabilities
Jiri Kosina <jkosina(a)suse.cz>
x86/bugs, kvm: Introduce boot-time control of L1TF mitigations
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early
Jiri Kosina <jkosina(a)suse.cz>
cpu/hotplug: Expose SMT control init function
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Allow runtime control of L1D flush
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Serialize L1D flush parameter setter
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Add static key for flush always
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Move l1tf setup function
Thomas Gleixner <tglx(a)linutronix.de>
x86/l1tf: Handle EPT disabled state proper
Thomas Gleixner <tglx(a)linutronix.de>
x86/kvm: Drop L1TF MSR list approach
Thomas Gleixner <tglx(a)linutronix.de>
x86/litf: Introduce vmx status variable
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Online siblings when SMT control is turned on
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Separate the VMX AUTOLOAD guest/host number accounting
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Add find_msr() helper function
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Split the VMX MSR LOAD structures to have an host/guest numbers
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D flush logic
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D MSR based flush
Paolo Bonzini <pbonzini(a)redhat.com>
x86/KVM/VMX: Add L1D flush algorithm
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM/VMX: Add module argument for L1TF mitigation
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/KVM: Warn user if KVM is loaded SMT and L1TF CPU bug being present
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Boot HT siblings at least once
Thomas Gleixner <tglx(a)linutronix.de>
Revert "x86/apic: Ignore secondary threads if nosmt=force"
Michal Hocko <mhocko(a)suse.cz>
x86/speculation/l1tf: Fix up pte->pfn conversion for PAE
Vlastimil Babka <vbabka(a)suse.cz>
x86/speculation/l1tf: Protect PAE swap entries against L1TF
Borislav Petkov <bp(a)suse.de>
x86/CPU/AMD: Move TOPOEXT reenablement before reading smp_num_siblings
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/cpufeatures: Add detection of L1D cache flush support.
Vlastimil Babka <vbabka(a)suse.cz>
x86/speculation/l1tf: Extend 64bit swap file size limit
Thomas Gleixner <tglx(a)linutronix.de>
x86/apic: Ignore secondary threads if nosmt=force
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/AMD: Evaluate smp_num_siblings early
Borislav Petkov <bp(a)suse.de>
x86/CPU/AMD: Do not check CPUID max ext level before parsing SMP info
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/intel: Evaluate smp_num_siblings early
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/topology: Provide detect_extended_topology_early()
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/common: Provide detect_ht_early()
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu/AMD: Remove the pointless detect_ht() call
Thomas Gleixner <tglx(a)linutronix.de>
x86/cpu: Remove the pointless CPU printout
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Provide knobs to control SMT
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Split do_cpu_down()
Thomas Gleixner <tglx(a)linutronix.de>
cpu/hotplug: Make bringup/teardown of smp threads symmetric
Thomas Gleixner <tglx(a)linutronix.de>
x86/topology: Provide topology_smt_supported()
Thomas Gleixner <tglx(a)linutronix.de>
x86/smp: Provide topology_is_primary_thread()
Peter Zijlstra <peterz(a)infradead.org>
sched/smt: Update sched_smt_present at runtime
Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
x86/bugs: Move the l1tf function and define pr_fmt properly
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Limit swap file size to MAX_PA/2
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Add sysfs reporting for l1tf
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Make sure the first page is always reserved
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
Linus Torvalds <torvalds(a)linux-foundation.org>
x86/speculation/l1tf: Protect swap entries against L1TF
Linus Torvalds <torvalds(a)linux-foundation.org>
x86/speculation/l1tf: Change order of offset/type in swap entry
Andi Kleen <ak(a)linux.intel.com>
x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT
Nick Desaulniers <ndesaulniers(a)google.com>
x86/irqflags: Provide a declaration for native_save_fl
Masami Hiramatsu <mhiramat(a)kernel.org>
kprobes/x86: Fix %p uses in error messages
Jiri Kosina <jkosina(a)suse.cz>
x86/speculation: Protect against userspace-userspace spectreRSB
Peter Zijlstra <peterz(a)infradead.org>
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
-------------
Diffstat:
Documentation/ABI/testing/sysfs-devices-system-cpu | 24 +
Documentation/admin-guide/index.rst | 9 +
Documentation/admin-guide/kernel-parameters.txt | 78 +++
Documentation/admin-guide/l1tf.rst | 610 +++++++++++++++++++++
Makefile | 4 +-
arch/Kconfig | 3 +
arch/x86/Kconfig | 1 +
arch/x86/include/asm/apic.h | 9 +
arch/x86/include/asm/cpufeatures.h | 3 +
arch/x86/include/asm/dmi.h | 2 +-
arch/x86/include/asm/hardirq.h | 26 +-
arch/x86/include/asm/irqflags.h | 2 +
arch/x86/include/asm/kvm_host.h | 6 +
arch/x86/include/asm/msr-index.h | 7 +
arch/x86/include/asm/page_32_types.h | 9 +-
arch/x86/include/asm/pgtable-2level.h | 17 +
arch/x86/include/asm/pgtable-3level.h | 37 +-
arch/x86/include/asm/pgtable-invert.h | 32 ++
arch/x86/include/asm/pgtable.h | 74 ++-
arch/x86/include/asm/pgtable_64.h | 38 +-
arch/x86/include/asm/processor.h | 17 +
arch/x86/include/asm/topology.h | 6 +-
arch/x86/include/asm/vmx.h | 11 +
arch/x86/kernel/apic/apic.c | 16 +
arch/x86/kernel/apic/io_apic.c | 1 +
arch/x86/kernel/apic/msi.c | 1 +
arch/x86/kernel/apic/vector.c | 1 +
arch/x86/kernel/cpu/amd.c | 51 +-
arch/x86/kernel/cpu/bugs.c | 171 ++++--
arch/x86/kernel/cpu/common.c | 56 +-
arch/x86/kernel/cpu/cpu.h | 2 +
arch/x86/kernel/cpu/intel.c | 7 +
arch/x86/kernel/cpu/microcode/core.c | 16 +-
arch/x86/kernel/cpu/topology.c | 41 +-
arch/x86/kernel/fpu/core.c | 1 +
arch/x86/kernel/hpet.c | 1 +
arch/x86/kernel/i8259.c | 1 +
arch/x86/kernel/idt.c | 1 +
arch/x86/kernel/irq.c | 1 +
arch/x86/kernel/irq_32.c | 1 +
arch/x86/kernel/irq_64.c | 1 +
arch/x86/kernel/irqinit.c | 1 +
arch/x86/kernel/kprobes/core.c | 5 +-
arch/x86/kernel/paravirt.c | 14 +-
arch/x86/kernel/setup.c | 6 +
arch/x86/kernel/smp.c | 1 +
arch/x86/kernel/smpboot.c | 18 +
arch/x86/kernel/time.c | 1 +
arch/x86/kvm/mmu.c | 1 +
arch/x86/kvm/vmx.c | 455 ++++++++++++---
arch/x86/kvm/x86.c | 34 +-
arch/x86/mm/init.c | 23 +
arch/x86/mm/kmmio.c | 25 +-
arch/x86/mm/mmap.c | 21 +
arch/x86/mm/pageattr.c | 8 +-
arch/x86/mm/pti.c | 1 +
.../intel-mid/device_libs/platform_mrfld_wdt.c | 1 +
arch/x86/platform/uv/tlb_uv.c | 1 +
arch/x86/xen/enlighten.c | 1 +
drivers/base/cpu.c | 8 +
drivers/gpu/drm/i915/i915_pmu.c | 1 +
drivers/gpu/drm/i915/intel_lpe_audio.c | 1 +
drivers/pci/controller/pci-hyperv.c | 1 +
include/asm-generic/pgtable.h | 12 +
include/linux/cpu.h | 21 +
include/linux/swapfile.h | 2 +
kernel/cpu.c | 280 +++++++++-
kernel/sched/core.c | 30 +-
kernel/sched/fair.c | 1 +
kernel/smp.c | 2 +
mm/memory.c | 37 +-
mm/mprotect.c | 49 ++
mm/swapfile.c | 46 +-
tools/arch/x86/include/asm/cpufeatures.h | 3 +
74 files changed, 2206 insertions(+), 300 deletions(-)
ARM64's pfn_valid() shifts away the upper PAGE_SHIFT bits of the input
before seeing if the PFN is valid. This leads to false positives when
some of the upper bits are set, but the lower bits match a valid PFN.
For example, the following userspace code looks up a bogus entry in
/proc/kpageflags:
int pagemap = open("/proc/self/pagemap", O_RDONLY);
int pageflags = open("/proc/kpageflags", O_RDONLY);
uint64_t pfn, val;
lseek64(pagemap, [...], SEEK_SET);
read(pagemap, &pfn, sizeof(pfn));
if (pfn & (1UL << 63)) { /* valid PFN */
	pfn &= ((1UL << 55) - 1); /* clear flag bits */
	pfn |= (1UL << 55); /* set an upper bit to build a bogus PFN */
	lseek64(pageflags, pfn * sizeof(uint64_t), SEEK_SET);
	read(pageflags, &val, sizeof(val));
}
On ARM64 this causes the userspace process to crash with SIGSEGV rather
than reading (1 << KPF_NOPAGE). kpageflags_read() treats the offset as
valid, and stable_page_flags() will try to access an address between the
user and kernel address ranges.
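As a rough user-space illustration (assuming 4K pages, i.e. PAGE_SHIFT ==
12, and a 64-bit PFN type; not kernel code), the round-trip check used in
the fix below rejects such PFNs: shifting the high bits left by PAGE_SHIFT
drops them, so shifting back no longer reproduces the original value.

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		uint64_t pfn  = ((uint64_t)1 << 55) | 0x1234; /* bogus upper bit set */
		uint64_t addr = pfn << PAGE_SHIFT; /* bit 55 shifts out of the 64-bit value */

		/* Mirrors the check added to pfn_valid(): a PFN that does
		 * not survive the round trip cannot name a real page. */
		if ((addr >> PAGE_SHIFT) != pfn)
			printf("pfn 0x%llx rejected\n", (unsigned long long)pfn);
		return 0;
	}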
Fixes: c1cc1552616d ("arm64: MMU initialisation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Greg Hackmann <ghackmann(a)google.com>
---
arch/arm64/mm/init.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9abf8a1e7b25..787e27964ab9 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -287,7 +287,11 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
#ifdef CONFIG_HAVE_ARCH_PFN_VALID
int pfn_valid(unsigned long pfn)
{
- return memblock_is_map_memory(pfn << PAGE_SHIFT);
+ phys_addr_t addr = pfn << PAGE_SHIFT;
+
+ if ((addr >> PAGE_SHIFT) != pfn)
+ return 0;
+ return memblock_is_map_memory(addr);
}
EXPORT_SYMBOL(pfn_valid);
#endif
--
2.18.0.865.gffc8e1a3cd6-goog