The patch titled
Subject: selftests/mm: compaction_test: support platform with huge mount of memory
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Feng Tang <feng.tang(a)linux.alibaba.com>
Subject: selftests/mm: compaction_test: support platform with huge mount of memory
Date: Wed, 23 Apr 2025 18:36:45 +0800
When running mm selftest to verify mm patches, 'compaction_test' case
failed on an x86 server with 1TB memory. And the root cause is that it
has too much free memory than what the test supports.
The test case tries to allocate 100000 huge pages, which is about 200 GB
for that x86 server, and when it succeeds, it expects it's large than 1/3
of 80% of the free memory in system. This logic only works for platform
with 750 GB ( 200 / (1/3) / 80% ) or less free memory, and may raise false
alarm for others.
Fix it by changing the fixed page number to self-adjustable number
according to the real number of free memory.
Link: https://lkml.kernel.org/r/20250423103645.2758-1-feng.tang@linux.alibaba.com
Fixes: bd67d5c15cc19 ("Test compaction of mlocked memory")
Signed-off-by: Feng Tang <feng.tang(a)linux.alibaba.com>
Acked-by: Dev Jain <dev.jain(a)arm.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Sri Jayaramappa <sjayaram(a)akamai.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/compaction_test.c | 19 ++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
--- a/tools/testing/selftests/mm/compaction_test.c~selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory
+++ a/tools/testing/selftests/mm/compaction_test.c
@@ -90,6 +90,8 @@ int check_compaction(unsigned long mem_f
int compaction_index = 0;
char nr_hugepages[20] = {0};
char init_nr_hugepages[24] = {0};
+ char target_nr_hugepages[24] = {0};
+ int slen;
snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
"%lu", initial_nr_hugepages);
@@ -106,11 +108,18 @@ int check_compaction(unsigned long mem_f
goto out;
}
- /* Request a large number of huge pages. The Kernel will allocate
- as much as it can */
- if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
- ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ /*
+ * Request huge pages for about half of the free memory. The Kernel
+ * will allocate as much as it can, and we expect it will get at least 1/3
+ */
+ nr_hugepages_ul = mem_free / hugepage_size / 2;
+ snprintf(target_nr_hugepages, sizeof(target_nr_hugepages),
+ "%lu", nr_hugepages_ul);
+
+ slen = strlen(target_nr_hugepages);
+ if (write(fd, target_nr_hugepages, slen) != slen) {
+ ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n",
+ nr_hugepages_ul, strerror(errno));
goto close_fd;
}
_
Patches currently in -mm which might be from feng.tang(a)linux.alibaba.com are
selftests-mm-compaction_test-support-platform-with-huge-mount-of-memory.patch
From: Long Li <longli(a)microsoft.com>
There are use cases that interrupt and monitor pages are mapped to
user-mode through UIO, they need to be system page aligned. Some Hyper-V
allocation APIs introduced earlier broke those requirements.
Fix those APIs by always allocating Hyper-V page at system page boundaries.
Cc: stable(a)vger.kernel.org
Fixes: ca48739e59df ("Drivers: hv: vmbus: Move Hyper-V page allocator to arch neutral code")
Signed-off-by: Long Li <longli(a)microsoft.com>
---
drivers/hv/hv_common.c | 29 +++++++----------------------
1 file changed, 7 insertions(+), 22 deletions(-)
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index a7d7494feaca..f426aaa9b8f9 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -106,41 +106,26 @@ void __init hv_common_free(void)
}
/*
- * Functions for allocating and freeing memory with size and
- * alignment HV_HYP_PAGE_SIZE. These functions are needed because
- * the guest page size may not be the same as the Hyper-V page
- * size. We depend upon kmalloc() aligning power-of-two size
- * allocations to the allocation size boundary, so that the
- * allocated memory appears to Hyper-V as a page of the size
- * it expects.
+ * A Hyper-V page can be used by UIO for mapping to user-space, it should
+ * always be allocated on system page boundaries.
*/
-
void *hv_alloc_hyperv_page(void)
{
- BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
-
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- return (void *)__get_free_page(GFP_KERNEL);
- else
- return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+ return (void *)__get_free_page(GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
void *hv_alloc_hyperv_zeroed_page(void)
{
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- else
- return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
void hv_free_hyperv_page(void *addr)
{
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- free_page((unsigned long)addr);
- else
- kfree(addr);
+ free_page((unsigned long)addr);
}
EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
--
2.34.1
Syzkaller detected a use-after-free issue in ext4_insert_dentry that was
caused by out-of-bounds access due to incorrect splitting in do_split.
BUG: KASAN: use-after-free in ext4_insert_dentry+0x36a/0x6d0 fs/ext4/namei.c:2109
Write of size 251 at addr ffff888074572f14 by task syz-executor335/5847
CPU: 0 UID: 0 PID: 5847 Comm: syz-executor335 Not tainted 6.12.0-rc6-syzkaller-00318-ga9cda7c0ffed #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/30/2024
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:94 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
print_address_description mm/kasan/report.c:377 [inline]
print_report+0x169/0x550 mm/kasan/report.c:488
kasan_report+0x143/0x180 mm/kasan/report.c:601
kasan_check_range+0x282/0x290 mm/kasan/generic.c:189
__asan_memcpy+0x40/0x70 mm/kasan/shadow.c:106
ext4_insert_dentry+0x36a/0x6d0 fs/ext4/namei.c:2109
add_dirent_to_buf+0x3d9/0x750 fs/ext4/namei.c:2154
make_indexed_dir+0xf98/0x1600 fs/ext4/namei.c:2351
ext4_add_entry+0x222a/0x25d0 fs/ext4/namei.c:2455
ext4_add_nondir+0x8d/0x290 fs/ext4/namei.c:2796
ext4_symlink+0x920/0xb50 fs/ext4/namei.c:3431
vfs_symlink+0x137/0x2e0 fs/namei.c:4615
do_symlinkat+0x222/0x3a0 fs/namei.c:4641
__do_sys_symlink fs/namei.c:4662 [inline]
__se_sys_symlink fs/namei.c:4660 [inline]
__x64_sys_symlink+0x7a/0x90 fs/namei.c:4660
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f
</TASK>
The following loop is located right above 'if' statement.
for (i = count-1; i >= 0; i--) {
/* is more than half of this entry in 2nd half of the block? */
if (size + map[i].size/2 > blocksize/2)
break;
size += map[i].size;
move++;
}
'i' in this case could go down to -1, in which case sum of active entries
wouldn't exceed half the block size, but previous behaviour would also do
split in half if sum would exceed at the very last block, which in case of
having too many long name files in a single block could lead to
out-of-bounds access and following use-after-free.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Cc: stable(a)vger.kernel.org
Fixes: 5872331b3d91 ("ext4: fix potential negative array index in do_split()")
Signed-off-by: Artem Sadovnikov <a.sadovnikov(a)ispras.ru>
---
fs/ext4/namei.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cb5cb33b1d91..e9712e64ec8f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1971,7 +1971,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
--
2.43.0
Getting / Setting the frame interval using the V4L2 subdev pad ops
get_frame_interval/set_frame_interval causes a deadlock, as the
subdev state is locked in the [1] but also in the driver itself.
In [2] it's described that the caller is responsible to acquire and
release the lock in this case. Therefore, acquiring the lock in the
driver is wrong.
Remove the lock acquisitions/releases from mt9m114_ifp_get_frame_interval()
and mt9m114_ifp_set_frame_interval().
[1] drivers/media/v4l2-core/v4l2-subdev.c - line 1129
[2] Documentation/driver-api/media/v4l2-subdev.rst
Fixes: 24d756e914fc ("media: i2c: Add driver for onsemi MT9M114 camera sensor")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathis Foerst <mathis.foerst(a)mt.com>
---
drivers/media/i2c/mt9m114.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/drivers/media/i2c/mt9m114.c b/drivers/media/i2c/mt9m114.c
index 65b9124e464f..79c97ab19be9 100644
--- a/drivers/media/i2c/mt9m114.c
+++ b/drivers/media/i2c/mt9m114.c
@@ -1644,13 +1644,9 @@ static int mt9m114_ifp_get_frame_interval(struct v4l2_subdev *sd,
if (interval->which != V4L2_SUBDEV_FORMAT_ACTIVE)
return -EINVAL;
- mutex_lock(sensor->ifp.hdl.lock);
-
ival->numerator = 1;
ival->denominator = sensor->ifp.frame_rate;
- mutex_unlock(sensor->ifp.hdl.lock);
-
return 0;
}
@@ -1669,8 +1665,6 @@ static int mt9m114_ifp_set_frame_interval(struct v4l2_subdev *sd,
if (interval->which != V4L2_SUBDEV_FORMAT_ACTIVE)
return -EINVAL;
- mutex_lock(sensor->ifp.hdl.lock);
-
if (ival->numerator != 0 && ival->denominator != 0)
sensor->ifp.frame_rate = min_t(unsigned int,
ival->denominator / ival->numerator,
@@ -1684,8 +1678,6 @@ static int mt9m114_ifp_set_frame_interval(struct v4l2_subdev *sd,
if (sensor->streaming)
ret = mt9m114_set_frame_rate(sensor);
- mutex_unlock(sensor->ifp.hdl.lock);
-
return ret;
}
--
2.34.1
From: James Smart <jsmart2021(a)gmail.com>
[ Upstream commit 577a942df3de2666f6947bdd3a5c9e8d30073424 ]
If lpfc_issue_els_flogi() fails and returns non-zero status, the node
reference count is decremented to trigger the release of the nodelist
structure. However, if there is a prior registration or dev-loss-evt work
pending, the node may be released prematurely. When dev-loss-evt
completes, the released node is referenced causing a use-after-free null
pointer dereference.
Similarly, when processing non-zero ELS PLOGI completion status in
lpfc_cmpl_els_plogi(), the ndlp flags are checked for a transport
registration before triggering node removal. If dev-loss-evt work is
pending, the node may be released prematurely and a subsequent call to
lpfc_dev_loss_tmo_handler() results in a use after free ndlp dereference.
Add test for pending dev-loss before decrementing the node reference count
for FLOGI, PLOGI, PRLI, and ADISC handling.
Link: https://lore.kernel.org/r/20220412222008.126521-9-jsmart2021@gmail.com
Co-developed-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
[Minor context change fixed.]
Signed-off-by: Bin Lan <bin.lan.cn(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Build test passed.
---
drivers/scsi/lpfc/lpfc_els.c | 51 +++++++++++++++++++++++++-----------
1 file changed, 35 insertions(+), 16 deletions(-)
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 5f44a0763f37..134d56bd00da 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -1517,10 +1517,13 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
}
if (lpfc_issue_els_flogi(vport, ndlp, 0)) {
- /* This decrement of reference count to node shall kick off
- * the release of the node.
+ /* A node reference should be retained while registered with a
+ * transport or dev-loss-evt work is pending.
+ * Otherwise, decrement node reference to trigger release.
*/
- lpfc_nlp_put(ndlp);
+ if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD)) &&
+ !(ndlp->nlp_flag & NLP_IN_DEV_LOSS))
+ lpfc_nlp_put(ndlp);
return 0;
}
return 1;
@@ -1563,10 +1566,13 @@ lpfc_initial_fdisc(struct lpfc_vport *vport)
}
if (lpfc_issue_els_fdisc(vport, ndlp, 0)) {
- /* decrement node reference count to trigger the release of
- * the node.
+ /* A node reference should be retained while registered with a
+ * transport or dev-loss-evt work is pending.
+ * Otherwise, decrement node reference to trigger release.
*/
- lpfc_nlp_put(ndlp);
+ if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD)) &&
+ !(ndlp->nlp_flag & NLP_IN_DEV_LOSS))
+ lpfc_nlp_put(ndlp);
return 0;
}
return 1;
@@ -1967,6 +1973,7 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
struct lpfc_dmabuf *prsp;
int disc;
struct serv_parm *sp = NULL;
+ bool release_node = false;
/* we pass cmdiocb to state machine which needs rspiocb as well */
cmdiocb->context_un.rsp_iocb = rspiocb;
@@ -2047,19 +2054,21 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
spin_unlock_irq(&ndlp->lock);
goto out;
}
- spin_unlock_irq(&ndlp->lock);
/* No PLOGI collision and the node is not registered with the
* scsi or nvme transport. It is no longer an active node. Just
* start the device remove process.
*/
if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD))) {
- spin_lock_irq(&ndlp->lock);
ndlp->nlp_flag &= ~NLP_NPR_2B_DISC;
- spin_unlock_irq(&ndlp->lock);
+ if (!(ndlp->nlp_flag & NLP_IN_DEV_LOSS))
+ release_node = true;
+ }
+ spin_unlock_irq(&ndlp->lock);
+
+ if (release_node)
lpfc_disc_state_machine(vport, ndlp, cmdiocb,
NLP_EVT_DEVICE_RM);
- }
} else {
/* Good status, call state machine */
prsp = list_entry(((struct lpfc_dmabuf *)
@@ -2269,6 +2278,7 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
struct lpfc_nodelist *ndlp;
char *mode;
u32 loglevel;
+ bool release_node = false;
/* we pass cmdiocb to state machine which needs rspiocb as well */
cmdiocb->context_un.rsp_iocb = rspiocb;
@@ -2335,14 +2345,18 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
* it is no longer an active node. Otherwise devloss
* handles the final cleanup.
*/
+ spin_lock_irq(&ndlp->lock);
if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD)) &&
!ndlp->fc4_prli_sent) {
- spin_lock_irq(&ndlp->lock);
ndlp->nlp_flag &= ~NLP_NPR_2B_DISC;
- spin_unlock_irq(&ndlp->lock);
+ if (!(ndlp->nlp_flag & NLP_IN_DEV_LOSS))
+ release_node = true;
+ }
+ spin_unlock_irq(&ndlp->lock);
+
+ if (release_node)
lpfc_disc_state_machine(vport, ndlp, cmdiocb,
NLP_EVT_DEVICE_RM);
- }
} else {
/* Good status, call state machine. However, if another
* PRLI is outstanding, don't call the state machine
@@ -2713,6 +2727,7 @@ lpfc_cmpl_els_adisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
IOCB_t *irsp;
struct lpfc_nodelist *ndlp;
int disc;
+ bool release_node = false;
/* we pass cmdiocb to state machine which needs rspiocb as well */
cmdiocb->context_un.rsp_iocb = rspiocb;
@@ -2771,13 +2786,17 @@ lpfc_cmpl_els_adisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
* transport, it is no longer an active node. Otherwise
* devloss handles the final cleanup.
*/
+ spin_lock_irq(&ndlp->lock);
if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD))) {
- spin_lock_irq(&ndlp->lock);
ndlp->nlp_flag &= ~NLP_NPR_2B_DISC;
- spin_unlock_irq(&ndlp->lock);
+ if (!(ndlp->nlp_flag & NLP_IN_DEV_LOSS))
+ release_node = true;
+ }
+ spin_unlock_irq(&ndlp->lock);
+
+ if (release_node)
lpfc_disc_state_machine(vport, ndlp, cmdiocb,
NLP_EVT_DEVICE_RM);
- }
} else
/* Good status, call state machine */
lpfc_disc_state_machine(vport, ndlp, cmdiocb,
--
2.34.1
Nouveau is mostly designed in a way that it's expected that fences only
ever get signaled through nouveau_fence_signal(). However, in at least
one other place, nouveau_fence_done(), can signal fences, too. If that
happens (race) a signaled fence remains in the pending list for a while,
until it gets removed by nouveau_fence_update().
Should nouveau_fence_context_kill() run in the meantime, this would be
a bug because the function would attempt to set an error code on an
already signaled fence.
Have nouveau_fence_context_kill() check for a fence being signaled.
Cc: <stable(a)vger.kernel.org> # v5.10+
Fixes: ea13e5abf807 ("drm/nouveau: signal pending fences when channel has been killed")
Suggested-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Philipp Stanner <phasta(a)kernel.org>
---
drivers/gpu/drm/nouveau/nouveau_fence.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 7622587f149e..6ded8c2b6d3b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -90,7 +90,7 @@ nouveau_fence_context_kill(struct nouveau_fence_chan *fctx, int error)
while (!list_empty(&fctx->pending)) {
fence = list_entry(fctx->pending.next, typeof(*fence), head);
- if (error)
+ if (error && !dma_fence_is_signaled_locked(&fence->base))
dma_fence_set_error(&fence->base, error);
if (nouveau_fence_signal(fence))
--
2.48.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 4ce385f56434f3810ef103e1baea357ddcc6667e
Gitweb: https://git.kernel.org/tip/4ce385f56434f3810ef103e1baea357ddcc6667e
Author: Juergen Gross <jgross(a)suse.com>
AuthorDate: Tue, 22 Apr 2025 15:17:17 +02:00
Committer: Dave Hansen <dave.hansen(a)linux.intel.com>
CommitterDate: Wed, 23 Apr 2025 07:49:14 -07:00
x86/mm: Fix _pgd_alloc() for Xen PV mode
Recently _pgd_alloc() was switched from using __get_free_pages() to
pagetable_alloc_noprof(), which might return a compound page in case
the allocation order is larger than 0.
On x86 this will be the case if CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
is set, even if PTI has been disabled at runtime.
When running as a Xen PV guest (this will always disable PTI), using
a compound page for a PGD will result in VM_BUG_ON_PGFLAGS being
triggered when the Xen code tries to pin the PGD.
Fix the Xen issue together with the not needed 8k allocation for a
PGD with PTI disabled by replacing PGD_ALLOCATION_ORDER with an
inline helper returning the needed order for PGD allocations.
Fixes: a9b3c355c2e6 ("asm-generic: pgalloc: provide generic __pgd_{alloc,free}")
Reported-by: Petr Vaněk <arkamar(a)atlas.cz>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Tested-by: Petr Vaněk <arkamar(a)atlas.cz>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250422131717.25724-1-jgross%40suse.com
---
arch/x86/include/asm/pgalloc.h | 19 +++++++++++--------
arch/x86/kernel/machine_kexec_32.c | 4 ++--
arch/x86/mm/pgtable.c | 4 ++--
arch/x86/platform/efi/efi_64.c | 4 ++--
4 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index a331475..c88691b 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -6,6 +6,8 @@
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
+#include <asm/cpufeature.h>
+
#define __HAVE_ARCH_PTE_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>
@@ -29,16 +31,17 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
-#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
- * Instead of one PGD, we acquire two PGDs. Being order-1, it is
- * both 8k in size and 8k-aligned. That lets us just flip bit 12
- * in a pointer to swap between the two 4k halves.
+ * In case of Page Table Isolation active, we acquire two PGDs instead of one.
+ * Being order-1, it is both 8k in size and 8k-aligned. That lets us just
+ * flip bit 12 in a pointer to swap between the two 4k halves.
*/
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+static inline unsigned int pgd_allocation_order(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ return 1;
+ return 0;
+}
/*
* Allocate and free page tables.
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8026516..1f32530 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -42,7 +42,7 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
- free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
+ free_pages((unsigned long)image->arch.pgd, pgd_allocation_order());
image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
@@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image)
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- PGD_ALLOCATION_ORDER);
+ pgd_allocation_order());
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a05fcdd..f7ae44d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -360,7 +360,7 @@ static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
- return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
+ return __pgd_alloc(mm, pgd_allocation_order());
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
@@ -380,7 +380,7 @@ static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
- return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
+ return __pgd_alloc(mm, pgd_allocation_order());
}
static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac57259..a4b4ebd 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -73,7 +73,7 @@ int __init efi_alloc_page_tables(void)
gfp_t gfp_mask;
gfp_mask = GFP_KERNEL | __GFP_ZERO;
- efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_allocation_order());
if (!efi_pgd)
goto fail;
@@ -96,7 +96,7 @@ free_p4d:
if (pgtable_l5_enabled())
free_page((unsigned long)pgd_page_vaddr(*pgd));
free_pgd:
- free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
+ free_pages((unsigned long)efi_pgd, pgd_allocation_order());
fail:
return -ENOMEM;
}