Hello guys,
During the last few months, I felt a performance regression when using
read() and write() on my high-speed Nvme SSD (about 7GB/s).
To get more precise information about it I quickly developed benchmark
tool basically running read() or write() in a loop to simulate a
sequential file read or write. The tool also measures the real time
consumed by the loop. Finally, the tool can call open() with or without
O_DIRECT.
I ran the tests on EXT4 and Exfat with following settings (buffer
values have been set for best result):
- Write settings: buffer 400mb * 100
- Read settings: buffer 200mb
- Drop caches before non-direct read/write test
With this hardware:
- CPU AMD Ryzen 7600X
- RAM DDR5 5200 32GB
- SSD Kingston Fury Renegade 4TB with 4K LBA
Here are some results I got with last upstream kernels (default
config):
+------------------+----------+------------------+------------------+--
----------------+------------------+------------------+
| ~42GB | O_DIRECT | Linux 6.2.0 | Linux 6.3.0 |
Linux 6.4.0 | Linux 6.5.0 | Linux 6.5.5 |
+------------------+----------+------------------+------------------+--
----------------+------------------+------------------+
| Ext4 (sector 4k) | | | |
| | |
| Read | no | 7.2s (5800MB/s) | 7.1s (5890MB/s) |
8.3s (5050MB/s) | 13.2s (3180MB/s) | 13.2s (3180MB/s) |
| Write | no | 12.0s (3500MB/s) | 12.6s (3340MB/s) |
12.2s (3440MB/s) | 28.9s (1450MB/s) | 28.9s (1450MB/s) |
| Read | yes | 6.0s (7000MB/s) | 6.0s (7020MB/s) |
5.9s (7170MB/s) | 5.9s (7100MB/s) | 5.9s (7100MB/s) |
| Write | yes | 6.7s (6220MB/s) | 6.7s (6290MB/s) |
6.9s (6080MB/s) | 6.9s (6080MB/s) | 6.9s (6970MB/s) |
| Exfat (sector ?) | | | |
| | |
| Read | no | 7.3s (5770MB/s) | 7.2s (5830MB/s) |
9s (4620MB/s) | 13.3s (3150MB/s) | 13.2s (3180MB/s) |
| Write | no | 8.3s (5040MB/s) | 8.9s (4750MB/s) |
8.3s (5040MB/s) | 18.3s (2290MB/s) | 18.5s (2260MB/s) |
| Read | yes | 6.2s (6760MB/s) | 6.1s (6870MB/s) |
6.0s (6980MB/s) | 6.5s (6440MB/s) | 6.6s (6320MB/s) |
| Write | yes | 16.1s (2610MB/s) | 16.0s (2620MB/s) |
18.7s (2240MB/s) | 34.1s (1230MB/s) | 34.5s (1220MB/s) |
+------------------+----------+------------------+------------------+--
----------------+------------------+------------------+
Please note that I rounded some values to clarify readiness. Small
variations can be considered as margin error.
Ext4 results: cached reads/writes time have increased of almost 100%
from 6.2.0 to 6.5.0 with a first increase with 6.4.0. Direct access
times have stayed similar though.
Exfat results: performance decrease too with and without direct access
this time.
I realize there are thousands of commits between, plus the issue can
come from multiple kernel parts such as the page cache, the file system
implementation (especially for Exfat), the IO engine, a driver, etc.
The results also showed that there is not only a specific version
impacted. Anyway, at the end the performance have highly decreased.
If you want to verify my benchmark tool source code, please ask.
PS: sending again as only text body is accepted
Regards
Florent DELAHAYE
[ Replying to show this patch that got dropped from the mailing list ]
On Sat, 30 Sep 2023 16:32:16 -0400
Steven Rostedt <rostedt(a)goodmis.org> wrote:
> From: Beau Belgrave <beaub(a)linux.microsoft.com>
>
> All architectures should use a long aligned address passed to set_bit().
> User processes can pass either a 32-bit or 64-bit sized value to be
> updated when tracing is enabled when on a 64-bit kernel. Both cases are
> ensured to be naturally aligned, however, that is not enough. The
> address must be long aligned without affecting checks on the value
> within the user process which require different adjustments for the bit
> for little and big endian CPUs.
>
> Add a compat flag to user_event_enabler that indicates when a 32-bit
> value is being used on a 64-bit kernel. Long align addresses and correct
> the bit to be used by set_bit() to account for this alignment. Ensure
> compat flags are copied during forks and used during deletion clears.
>
> Link: https://lore.kernel.org/linux-trace-kernel/20230925230829.341-2-beaub@linux…
> Link: https://lore.kernel.org/linux-trace-kernel/20230914131102.179100-1-cleger@r…
>
> Cc: stable(a)vger.kernel.org
> Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement")
> Reported-by: Clément Léger <cleger(a)rivosinc.com>
> Suggested-by: Clément Léger <cleger(a)rivosinc.com>
> Signed-off-by: Beau Belgrave <beaub(a)linux.microsoft.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
> ---
> kernel/trace/trace_events_user.c | 58 ++++++++++++++++++++++++++++----
> 1 file changed, 51 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 6f046650e527..b87f41187c6a 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -127,8 +127,13 @@ struct user_event_enabler {
> /* Bit 7 is for freeing status of enablement */
> #define ENABLE_VAL_FREEING_BIT 7
>
> -/* Only duplicate the bit value */
> -#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
> +/* Bit 8 is for marking 32-bit on 64-bit */
> +#define ENABLE_VAL_32_ON_64_BIT 8
> +
> +#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
> +
> +/* Only duplicate the bit and compat values */
> +#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
>
> #define ENABLE_BITOPS(e) (&(e)->values)
>
> @@ -174,6 +179,30 @@ struct user_event_validator {
> int flags;
> };
>
> +static inline void align_addr_bit(unsigned long *addr, int *bit,
> + unsigned long *flags)
> +{
> + if (IS_ALIGNED(*addr, sizeof(long))) {
> +#ifdef __BIG_ENDIAN
> + /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
> + if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
> + *bit += 32;
> +#endif
> + return;
> + }
> +
> + *addr = ALIGN_DOWN(*addr, sizeof(long));
> +
> + /*
> + * We only support 32 and 64 bit values. The only time we need
> + * to align is a 32 bit value on a 64 bit kernel, which on LE
> + * is always 32 bits, and on BE requires no change when unaligned.
> + */
> +#ifdef __LITTLE_ENDIAN
> + *bit += 32;
> +#endif
> +}
> +
> typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
> void *tpdata, bool *faulted);
>
> @@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
> unsigned long *ptr;
> struct page *page;
> void *kaddr;
> + int bit = ENABLE_BIT(enabler);
> int ret;
>
> lockdep_assert_held(&event_mutex);
> @@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
> test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
> return -EBUSY;
>
> + align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
> +
> ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
> &page, NULL);
>
> @@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
>
> /* Update bit atomically, user tracers must be atomic as well */
> if (enabler->event && enabler->event->status)
> - set_bit(ENABLE_BIT(enabler), ptr);
> + set_bit(bit, ptr);
> else
> - clear_bit(ENABLE_BIT(enabler), ptr);
> + clear_bit(bit, ptr);
>
> kunmap_local(kaddr);
> unpin_user_pages_dirty_lock(&page, 1, true);
> @@ -849,6 +881,12 @@ static struct user_event_enabler
> enabler->event = user;
> enabler->addr = uaddr;
> enabler->values = reg->enable_bit;
> +
> +#if BITS_PER_LONG >= 64
> + if (reg->enable_size == 4)
> + set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
> +#endif
> +
> retry:
> /* Prevents state changes from racing with new enablers */
> mutex_lock(&event_mutex);
> @@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
> }
>
> static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
> - unsigned long uaddr, unsigned char bit)
> + unsigned long uaddr, unsigned char bit,
> + unsigned long flags)
> {
> struct user_event_enabler enabler;
> int result;
> @@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
>
> memset(&enabler, 0, sizeof(enabler));
> enabler.addr = uaddr;
> - enabler.values = bit;
> + enabler.values = bit | flags;
> retry:
> /* Prevents state changes from racing with new enablers */
> mutex_lock(&event_mutex);
> @@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> struct user_event_mm *mm = current->user_event_mm;
> struct user_event_enabler *enabler, *next;
> struct user_unreg reg;
> + unsigned long flags;
> long ret;
>
> ret = user_unreg_get(ureg, ®);
> @@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> if (!mm)
> return -ENOENT;
>
> + flags = 0;
> ret = -ENOENT;
>
> /*
> @@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> ENABLE_BIT(enabler) == reg.disable_bit) {
> set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
>
> + /* We must keep compat flags for the clear */
> + flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
> +
> if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
> user_event_enabler_destroy(enabler, true);
>
> @@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
> /* Ensure bit is now cleared for user, regardless of event status */
> if (!ret)
> ret = user_event_mm_clear_bit(mm, reg.disable_addr,
> - reg.disable_bit);
> + reg.disable_bit, flags);
>
> return ret;
> }
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
It was discovered that the ring buffer polling was incorrectly stating
that read would not block, but that's because polling did not take into
account that reads will block if the "buffer-percent" was set. Instead,
the ring buffer polling would say reads would not block if there was any
data in the ring buffer. This was incorrect behavior from a user space
point of view. This was fixed by commit 42fb0a1e84ff by having the polling
code check if the ring buffer had more data than what the user specified
"buffer percent" had.
The problem now is that the polling code did not register itself to the
writer that it wanted to wait for a specific "full" value of the ring
buffer. The result was that the writer would wake the polling waiter
whenever there was a new event. The polling waiter would then wake up, see
that there's not enough data in the ring buffer to notify user space and
then go back to sleep. The next event would wake it up again.
Before the polling fix was added, the code would wake up around 100 times
for a hackbench 30 benchmark. After the "fix", due to the constant waking
of the writer, it would wake up over 11,0000 times! It would never leave
the kernel, so the user space behavior was still "correct", but this
definitely is not the desired effect.
To fix this, have the polling code add what it's waiting for to the
"shortest_full" variable, to tell the writer not to wake it up if the
buffer is not as full as it expects to be.
Note, after this fix, it appears that the waiter is now woken up around 2x
the times it was before (~200). This is a tremendous improvement from the
11,000 times, but I will need to spend some time to see why polling is
more aggressive in its wakeups than the read blocking code.
Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschac…
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
Reported-by: Julia Lawall <julia.lawall(a)inria.fr>
Tested-by: Julia Lawall <julia.lawall(a)inria.fr>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28daf0ce95c5..515cafdb18d9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1137,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;
--
2.40.1
For an unknown TX CQE error type (probably from a newer hardware),
still free the SKB, update the queue tail, etc., otherwise the
accounting will be wrong.
Also, TX errors can be triggered by injecting corrupted packets, so
replace the WARN_ONCE to ratelimited error logging, because we don't
need stack trace here.
Cc: stable(a)vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Haiyang Zhang <haiyangz(a)microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 4a16ebff3d1d..5cdcf7561b38 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1317,19 +1317,23 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
case CQE_TX_VPORT_IDX_OUT_OF_RANGE:
case CQE_TX_VPORT_DISABLED:
case CQE_TX_VLAN_TAGGING_VIOLATION:
- WARN_ONCE(1, "TX: CQE error %d: ignored.\n",
- cqe_oob->cqe_hdr.cqe_type);
+ if (net_ratelimit())
+ netdev_err(ndev, "TX: CQE error %d\n",
+ cqe_oob->cqe_hdr.cqe_type);
+
apc->eth_stats.tx_cqe_err++;
break;
default:
- /* If the CQE type is unexpected, log an error, assert,
- * and go through the error path.
+ /* If the CQE type is unknown, log an error,
+ * and still free the SKB, update tail, etc.
*/
- WARN_ONCE(1, "TX: Unexpected CQE type %d: HW BUG?\n",
- cqe_oob->cqe_hdr.cqe_type);
+ if (net_ratelimit())
+ netdev_err(ndev, "TX: unknown CQE type %d\n",
+ cqe_oob->cqe_hdr.cqe_type);
+
apc->eth_stats.tx_cqe_unknown_type++;
- return;
+ break;
}
if (WARN_ON_ONCE(txq->gdma_txq_id != completions[i].wq_num))
--
2.25.1
It is possible for multiple vCPUs to fault on the same IPA and attempt
to resolve the fault. One of the page table walks will actually update
the PTE and the rest will return -EAGAIN per our race detection scheme.
KVM elides the TLB invalidation on the racing threads as the return
value is nonzero.
Before commit a12ab1378a88 ("KVM: arm64: Use local TLBI on permission
relaxation") KVM always used broadcast TLB invalidations when handling
permission faults, which had the convenient property of making the
stage-2 updates visible to all CPUs in the system. However now we do a
local invalidation, and TLBI elision leads to vCPUs getting stuck in a
permission fault loop. Remember that the architecture permits the TLB to
cache translations that precipitate a permission fault.
Invalidate the TLB entry responsible for the permission fault if the
stage-2 descriptor has been relaxed, regardless of which thread actually
did the job.
Cc: stable(a)vger.kernel.org
Fixes: a12ab1378a88 ("KVM: arm64: Use local TLBI on permission relaxation")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/hyp/pgtable.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f155b8c9e98c..286888751793 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1314,7 +1314,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
- if (!ret)
+ if (!ret || ret == -EAGAIN)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
base-commit: ce9ecca0238b140b88f43859b211c9fdfd8e5b70
--
2.42.0.515.g380fc7ccd1-goog
The quilt patch titled
Subject: Crash: add lock to serialize crash hotplug handling
has been removed from the -mm tree. Its filename was
crash-add-lock-to-serialize-crash-hotplug-handling.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Baoquan He <bhe(a)redhat.com>
Subject: Crash: add lock to serialize crash hotplug handling
Date: Tue, 26 Sep 2023 20:09:05 +0800
Eric reported that handling corresponding crash hotplug event can be
failed easily when many memory hotplug event are notified in a short
period. They failed because failing to take __kexec_lock.
=======
[ 78.714569] Fallback order for Node 0: 0
[ 78.714575] Built 1 zonelists, mobility grouping on. Total pages: 1817886
[ 78.717133] Policy zone: Normal
[ 78.724423] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 78.727207] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 80.056643] PEFILE: Unsigned PE binary
=======
The memory hotplug events are notified very quickly and very many, while
the handling of crash hotplug is much slower relatively. So the atomic
variable __kexec_lock and kexec_trylock() can't guarantee the
serialization of crash hotplug handling.
Here, add a new mutex lock __crash_hotplug_lock to serialize crash hotplug
handling specifically. This doesn't impact the usage of __kexec_lock.
Link: https://lkml.kernel.org/r/20230926120905.392903-1-bhe@redhat.com
Fixes: 247262756121 ("crash: add generic infrastructure for crash hotplug support")
Signed-off-by: Baoquan He <bhe(a)redhat.com>
Tested-by: Eric DeVolder <eric.devolder(a)oracle.com>
Reviewed-by: Eric DeVolder <eric.devolder(a)oracle.com>
Reviewed-by: Valentin Schneider <vschneid(a)redhat.com>
Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/crash_core.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
--- a/kernel/crash_core.c~crash-add-lock-to-serialize-crash-hotplug-handling
+++ a/kernel/crash_core.c
@@ -740,6 +740,17 @@ subsys_initcall(crash_notes_memory_init)
#define pr_fmt(fmt) "crash hp: " fmt
/*
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
+/*
* This routine utilized when the crash_hotplug sysfs node is read.
* It reflects the kernel's ability/permission to update the crash
* elfcorehdr directly.
@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void)
{
int rc = 0;
+ crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return 0;
}
if (kexec_crash_image) {
@@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void)
}
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
return rc;
}
@@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(u
{
struct kimage *image;
+ crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return;
}
@@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(u
out:
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
}
static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
_
Patches currently in -mm which might be from bhe(a)redhat.com are
crash_corec-remove-unnecessary-parameter-of-function.patch
crash_core-change-the-prototype-of-function-parse_crashkernel.patch
crash_core-change-parse_crashkernel-to-support-crashkernel=highlow-parsing.patch
crash_core-add-generic-function-to-do-reservation.patch
crash_core-move-crashk_res-definition-into-crash_corec.patch
x86-kdump-use-generic-interface-to-simplify-crashkernel-reservation-code.patch
x86-kdump-use-generic-interface-to-simplify-crashkernel-reservation-code-fix.patch
arm64-kdump-use-generic-interface-to-simplify-crashkernel-reservation.patch
riscv-kdump-use-generic-interface-to-simplify-crashkernel-reservation.patch
crash_corec-remove-unneeded-functions.patch
The quilt patch titled
Subject: selftests/mm: fix awk usage in charge_reserved_hugetlb.sh and hugetlb_reparenting_test.sh that may cause error
has been removed from the -mm tree. Its filename was
selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Juntong Deng <juntong.deng(a)outlook.com>
Subject: selftests/mm: fix awk usage in charge_reserved_hugetlb.sh and hugetlb_reparenting_test.sh that may cause error
Date: Wed, 27 Sep 2023 02:19:44 +0800
According to the awk manual, the -e option does not need to be specified
in front of 'program' (unless you need to mix program-file).
The redundant -e option can cause error when users use awk tools other
than gawk (for example, mawk does not support the -e option).
Error Example:
awk: not an option: -e
Link: https://lkml.kernel.org/r/VI1P193MB075228810591AF2FDD7D42C599C3A@VI1P193MB0…
Signed-off-by: Juntong Deng <juntong.deng(a)outlook.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++--
tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh~selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error
+++ a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -25,7 +25,7 @@ if [[ "$1" == "-cgroup-v2" ]]; then
fi
if [[ $cgroup2 ]]; then
- cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+ cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}')
if [[ -z "$cgroup_path" ]]; then
cgroup_path=/dev/cgroup/memory
mount -t cgroup2 none $cgroup_path
@@ -33,7 +33,7 @@ if [[ $cgroup2 ]]; then
fi
echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
else
- cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+ cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
if [[ -z "$cgroup_path" ]]; then
cgroup_path=/dev/cgroup/memory
mount -t cgroup memory,hugetlb $cgroup_path
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh~selftests-mm-fix-awk-usage-in-charge_reserved_hugetlbsh-and-hugetlb_reparenting_testsh-that-may-cause-error
+++ a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -20,7 +20,7 @@ fi
if [[ $cgroup2 ]]; then
- CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+ CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}')
if [[ -z "$CGROUP_ROOT" ]]; then
CGROUP_ROOT=/dev/cgroup/memory
mount -t cgroup2 none $CGROUP_ROOT
@@ -28,7 +28,7 @@ if [[ $cgroup2 ]]; then
fi
echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
else
- CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+ CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
if [[ -z "$CGROUP_ROOT" ]]; then
CGROUP_ROOT=/dev/cgroup/memory
mount -t cgroup memory,hugetlb $CGROUP_ROOT
_
Patches currently in -mm which might be from juntong.deng(a)outlook.com are
The quilt patch titled
Subject: mm: mempolicy: keep VMA walk if both MPOL_MF_STRICT and MPOL_MF_MOVE are specified
has been removed from the -mm tree. Its filename was
mm-mempolicy-keep-vma-walk-if-both-mpol_mf_strict-and-mpol_mf_move-are-specified.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: mempolicy: keep VMA walk if both MPOL_MF_STRICT and MPOL_MF_MOVE are specified
Date: Wed, 20 Sep 2023 15:32:42 -0700
When calling mbind() with MPOL_MF_{MOVE|MOVEALL} | MPOL_MF_STRICT, kernel
should attempt to migrate all existing pages, and return -EIO if there is
misplaced or unmovable page. Then commit 6f4576e3687b ("mempolicy: apply
page table walker on queue_pages_range()") messed up the return value and
didn't break VMA scan early ianymore when MPOL_MF_STRICT alone. The
return value problem was fixed by commit a7f40cfe3b7a ("mm: mempolicy:
make mbind() return -EIO when MPOL_MF_STRICT is specified"), but it broke
the VMA walk early if unmovable page is met, it may cause some pages are
not migrated as expected.
The code should conceptually do:
if (MPOL_MF_MOVE|MOVEALL)
scan all vmas
try to migrate the existing pages
return success
else if (MPOL_MF_MOVE* | MPOL_MF_STRICT)
scan all vmas
try to migrate the existing pages
return -EIO if unmovable or migration failed
else /* MPOL_MF_STRICT alone */
break early if meets unmovable and don't call mbind_range() at all
else /* none of those flags */
check the ranges in test_walk, EFAULT without mbind_range() if discontig.
Fixed the behavior.
Link: https://lkml.kernel.org/r/20230920223242.3425775-1-yang@os.amperecomputing.…
Fixes: a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Kirill A. Shutemov <kirill(a)shutemov.name>
Cc: David Rientjes <rientjes(a)google.com>
Cc: <stable(a)vger.kernel.org> [4.9+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 39 +++++++++++++++++++--------------------
1 file changed, 19 insertions(+), 20 deletions(-)
--- a/mm/mempolicy.c~mm-mempolicy-keep-vma-walk-if-both-mpol_mf_strict-and-mpol_mf_move-are-specified
+++ a/mm/mempolicy.c
@@ -426,6 +426,7 @@ struct queue_pages {
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
+ bool has_unmovable;
};
/*
@@ -446,9 +447,8 @@ static inline bool queue_folio_required(
/*
* queue_folios_pmd() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. huge zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing folio was already on a node that does not follow the
* policy.
@@ -479,7 +479,7 @@ static int queue_folios_pmd(pmd_t *pmd,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_folio_add(folio, qp->pagelist, flags)) {
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
} else
@@ -495,9 +495,8 @@ unlock:
*
* queue_folios_pte_range() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
* on a node that does not follow the policy.
*/
@@ -508,7 +507,6 @@ static int queue_folios_pte_range(pmd_t
struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- bool has_unmovable = false;
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
@@ -538,11 +536,12 @@ static int queue_folios_pte_range(pmd_t
if (!queue_folio_required(folio, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- /* MPOL_MF_STRICT must be specified if we get here */
- if (!vma_migratable(vma)) {
- has_unmovable = true;
- break;
- }
+ /*
+ * MPOL_MF_STRICT must be specified if we get here.
+ * Continue walking vmas due to MPOL_MF_MOVE* flags.
+ */
+ if (!vma_migratable(vma))
+ qp->has_unmovable = true;
/*
* Do not abort immediately since there may be
@@ -550,16 +549,13 @@ static int queue_folios_pte_range(pmd_t
* need migrate other LRU pages.
*/
if (migrate_folio_add(folio, qp->pagelist, flags))
- has_unmovable = true;
+ qp->has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
- if (has_unmovable)
- return 1;
-
return addr != end ? -EIO : 0;
}
@@ -599,7 +595,7 @@ static int queue_folios_hugetlb(pte_t *p
* Detecting misplaced folio but allow migrating folios which
* have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
@@ -620,7 +616,7 @@ static int queue_folios_hugetlb(pte_t *p
* Failed to isolate folio but allow migrating pages
* which have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
}
unlock:
spin_unlock(ptl);
@@ -756,12 +752,15 @@ queue_pages_range(struct mm_struct *mm,
.start = start,
.end = end,
.first = NULL,
+ .has_unmovable = false,
};
const struct mm_walk_ops *ops = lock_vma ?
&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
err = walk_page_range(mm, start, end, ops, &qp);
+ if (qp.has_unmovable)
+ err = 1;
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
@@ -1358,7 +1357,7 @@ static long do_mbind(unsigned long start
putback_movable_pages(&pagelist);
}
- if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
+ if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
err = -EIO;
} else {
up_out:
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are