On Thu, Apr 06, 2023 at 09:53:37AM -0700, Stefan Roesch wrote:
> + case PR_SET_MEMORY_MERGE:
> + if (mmap_write_lock_killable(me->mm))
> + return -EINTR;
> +
> + if (arg2) {
> + int err = ksm_add_mm(me->mm);
> + if (err)
> + return err;
You'll return to userspace with the mutex held, no?
On 06.04.23 18:53, Stefan Roesch wrote:
> This adds the general_profit KSM sysfs knob and the process profit metric
> and process merge type knobs to ksm_stat.
>
> 1) expose general_profit metric
>
> The documentation mentions a general profit metric, however this
> metric is not calculated. In addition the formula depends on the size
> of internal structures, which makes it more difficult for an
> administrator to make the calculation. Adding the metric for a better
> user experience.
>
> 2) document general_profit sysfs knob
>
> 3) calculate ksm process profit metric
>
> The ksm documentation mentions the process profit metric and how to
> calculate it. This adds the calculation of the metric.
>
> 4) add ksm_merge_type() function
>
> This adds the ksm_merge_type function. The function returns the
> merge type for the process. For madvise it returns "madvise", for
> prctl it returns "process" and otherwise it returns "none".
I'm curious, why exactly is this change required in this context? It
might be sufficient to observe if the prctl is set for a process. If
not, the ksm stats can reveal whether KSM is still active for that
process -> madvise.
For your use case, I'd assume it's pretty unnecessary to expose that.
If there is no compelling reason, I'd suggest to drop this and limit
this patch to exposing the general/per-mm profit, which I can understand
why it's desirable when fine-tuning a workload.
[...]
> Signed-off-by: Stefan Roesch <shr(a)devkernel.io>
> Reviewed-by: Bagas Sanjaya <bagasdotme(a)gmail.com>
> Cc: David Hildenbrand <david(a)redhat.com>
> Cc: Johannes Weiner <hannes(a)cmpxchg.org>
> Cc: Michal Hocko <mhocko(a)suse.com>
> Cc: Rik van Riel <riel(a)surriel.com>
> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
> ---
> Documentation/ABI/testing/sysfs-kernel-mm-ksm | 8 +++++
> Documentation/admin-guide/mm/ksm.rst | 8 ++++-
> fs/proc/base.c | 5 +++
> include/linux/ksm.h | 5 +++
> mm/ksm.c | 32 +++++++++++++++++++
> 5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
> index d244674a9480..7768e90f7a8f 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
> @@ -51,3 +51,11 @@ Description: Control merging pages across different NUMA nodes.
>
> When it is set to 0 only pages from the same node are merged,
> otherwise pages from all nodes can be merged together (default).
> +
> +What: /sys/kernel/mm/ksm/general_profit
> +Date: January 2023
^ No
> +KernelVersion: 6.1
^ Outdated
(kind of weird having to come up with the right numbers before getting
it merged)
[...]
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 07463ad4a70a..c74450318e05 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -96,6 +96,7 @@
> #include <linux/time_namespace.h>
> #include <linux/resctrl.h>
> #include <linux/cn_proc.h>
> +#include <linux/ksm.h>
> #include <trace/events/oom.h>
> #include "internal.h"
> #include "fd.h"
> @@ -3199,6 +3200,7 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *
>
> return 0;
> }
> +
^ unrelated change
> static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
> struct pid *pid, struct task_struct *task)
> {
> @@ -3208,6 +3210,9 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
> if (mm) {
> seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
> seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing);
> + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
> + seq_printf(m, "ksm_merge_type %s\n", ksm_merge_type(mm));
> + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
> mmput(mm);
> }
>
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> index c65455bf124c..4c32f9bca723 100644
> --- a/include/linux/ksm.h
> +++ b/include/linux/ksm.h
> @@ -60,6 +60,11 @@ struct page *ksm_might_need_to_copy(struct page *page,
> void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
> void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
>
> +#ifdef CONFIG_PROC_FS
> +long ksm_process_profit(struct mm_struct *);
> +const char *ksm_merge_type(struct mm_struct *mm);
> +#endif /* CONFIG_PROC_FS */
> +
> #else /* !CONFIG_KSM */
>
> static inline int ksm_add_mm(struct mm_struct *mm)
> diff --git a/mm/ksm.c b/mm/ksm.c
> index ab95ae0f9def..76b10ff840ac 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -3042,6 +3042,25 @@ static void wait_while_offlining(void)
> }
> #endif /* CONFIG_MEMORY_HOTREMOVE */
>
> +#ifdef CONFIG_PROC_FS
> +long ksm_process_profit(struct mm_struct *mm)
> +{
> + return (long)mm->ksm_merging_pages * PAGE_SIZE -
Do we really need the cast to long? mm->ksm_merging_pages is defined as
"unsigned long". Just like "ksm_pages_sharing" below.
> + mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
> +}
> +
> +/* Return merge type name as string. */
> +const char *ksm_merge_type(struct mm_struct *mm)
> +{
> + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
> + return "process";
> + else if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
> + return "madvise";
> + else
> + return "none";
> +}
> +#endif /* CONFIG_PROC_FS */
> +
Apart from these nits, LGTM (again, I don't see why the merge type
should belong into this patch, and why there is a real need to expose it
like that).
Acked-by: David Hildenbrand <david(a)redhat.com>
--
Thanks,
David / dhildenb
At present the kselftest header can't be used with nolibc since it makes
use of vprintf() which is not available in nolibc. Fortunately nolibc
has a vfprintf() so we can just wrap that in order to allow kselftests
to be built with nolibc and still use the standard kselftest headers
with a small change to prevent the inclusion of the standard libc
headers. This allows us to avoid open coding of KTAP output for
selftests that need to use nolibc in order to test interfaces that are
controlled by libc, we've got several open coded examples of this in the
tree already.
As an example of using this the existing za-fork test is converted to
use kselftest.h. The changes to kselftest and nolibc don't have any
interaction until they are used by a test so could be merged separately
if desired.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v2:
- Turns out nolibc has a vfprintf() already which we can use so do that.
- Link to v1: https://lore.kernel.org/r/20230405-kselftest-nolibc-v1-0-63fbcd70b202@kerne…
---
Mark Brown (3):
tools/nolibc/stdio: Implement vprintf()
kselftest: Support nolibc
kselftest/arm64: Convert za-fork to use kselftest.h
tools/include/nolibc/stdio.h | 6 ++
tools/testing/selftests/arm64/fp/Makefile | 2 +-
tools/testing/selftests/arm64/fp/za-fork.c | 88 ++++++------------------------
tools/testing/selftests/kselftest.h | 2 +
4 files changed, 25 insertions(+), 73 deletions(-)
---
base-commit: e8d018dd0257f744ca50a729e3d042cf2ec9da65
change-id: 20230405-kselftest-nolibc-cb2ce0446d09
Best regards,
--
Mark Brown <broonie(a)kernel.org>
From: Feng Zhou <zhoufeng.zf(a)bytedance.com>
When access traced function arguments with type is u32*, bpf verifier failed.
Because u32 have typedef, needs to skip modifier. Add btf_type_is_modifier in
is_int_ptr. Add a selftest to check it.
Feng Zhou (2):
bpf/btf: Fix is_int_ptr()
selftests/bpf: Add test to access u32 ptr argument in tracing program
Changelog:
v2->v3: Addressed comments from jirka
- Fix an issue that caused other test items to fail
Details in here:
https://lore.kernel.org/all/20230407084608.62296-1-zhoufeng.zf@bytedance.co…
v1->v2: Addressed comments from Martin KaFai Lau
- Add a selftest.
- use btf_type_skip_modifiers.
Some details in here:
https://lore.kernel.org/all/20221012125815.76120-1-zhouchengming@bytedance.…
kernel/bpf/btf.c | 8 ++------
net/bpf/test_run.c | 8 +++++++-
.../testing/selftests/bpf/verifier/btf_ctx_access.c | 13 +++++++++++++
3 files changed, 22 insertions(+), 7 deletions(-)
--
2.20.1
This is the basic functionality for iommufd to support
iommufd_device_replace() and IOMMU_HWPT_ALLOC for physical devices.
iommufd_device_replace() allows changing the HWPT associated with the
device to a new IOAS or HWPT. Replace does this in way that failure leaves
things unchanged, and utilizes the iommu iommu_group_replace_domain() API
to allow the iommu driver to perform an optional non-disruptive change.
IOMMU_HWPT_ALLOC allows HWPTs to be explicitly allocated by the user and
used by attach or replace. At this point it isn't very useful since the
HWPT is the same as the automatically managed HWPT from the IOAS. However
a following series will allow userspace to customize the created HWPT.
The implementation is complicated because we have to introduce some
per-iommu_group memory in iommufd and redo how we think about multi-device
groups to be more explicit. This solves all the locking problems in the
prior attempts.
This series is infrastructure work for the following series which:
- Add replace for attach
- Expose replace through VFIO APIs
- Implement driver parameters for HWPT creation (nesting)
I'll update the linux-next branch with this version and keep it on a side
branch and accumulate the following series when they are ready so we can have
a stable base and make more incremental progress. When we have all the parts
together to get a full implementation it can go to Linus.
This is on github: https://github.com/jgunthorpe/linux/commits/iommufd_hwpt
v5:
- Got back to the v3 version of the code, keep the comment changes from
v4. Syzkaller says the group lock change in v4 didn't work.
- Adjust the fail_nth test to cover the path syzkaller found. We need to
have an ioas with a mapped page installed to inject a failure during
domain attachment.
v4: https://lore.kernel.org/r/0-v4-9cd79ad52ee8+13f5-iommufd_alloc_jgg@nvidia.c…
- Refine comments and commit messages
- Move the group lock into iommufd_hw_pagetable_attach()
- Fix error unwind in iommufd_device_do_replace()
v3: https://lore.kernel.org/r/0-v3-61d41fd9e13e+1f5-iommufd_alloc_jgg@nvidia.com
- Refine comments and commit messages
- Adjust the flow in iommufd_device_auto_get_domain() so pt_id is only
set on success
- Reject replace on non-attached devices
- Add missing __reserved check for IOMMU_HWPT_ALLOC
v2: https://lore.kernel.org/r/0-v2-51b9896e7862+8a8c-iommufd_alloc_jgg@nvidia.c…
- Use WARN_ON for the igroup->group test and move that logic to a
function iommufd_group_try_get()
- Change igroup->devices to igroup->device list
Replace will need to iterate over all attached idevs
- Rename to iommufd_group_setup_msi()
- New patch to export iommu_get_resv_regions()
- New patch to use per-device reserved regions instead of per-group
regions
- Split out the reorganizing of iommufd_device_change_pt() from the
replace patch
- Replace uses the per-dev reserved regions
- Use stdev_id in a few more places in the selftest
- Fix error handling in IOMMU_HWPT_ALLOC
- Clarify comments
- Rebase on v6.3-rc1
v1: https://lore.kernel.org/all/0-v1-7612f88c19f5+2f21-iommufd_alloc_jgg@nvidia…
Jason Gunthorpe (15):
iommufd: Move isolated msi enforcement to iommufd_device_bind()
iommufd: Add iommufd_group
iommufd: Replace the hwpt->devices list with iommufd_group
iommu: Export iommu_get_resv_regions()
iommufd: Keep track of each device's reserved regions instead of
groups
iommufd: Use the iommufd_group to avoid duplicate MSI setup
iommufd: Make sw_msi_start a group global
iommufd: Move putting a hwpt to a helper function
iommufd: Add enforced_cache_coherency to iommufd_hw_pagetable_alloc()
iommufd: Reorganize iommufd_device_attach into
iommufd_device_change_pt
iommufd: Add iommufd_device_replace()
iommufd: Make destroy_rwsem use a lock class per object type
iommufd: Add IOMMU_HWPT_ALLOC
iommufd/selftest: Return the real idev id from selftest mock_domain
iommufd/selftest: Add a selftest for IOMMU_HWPT_ALLOC
Nicolin Chen (2):
iommu: Introduce a new iommu_group_replace_domain() API
iommufd/selftest: Test iommufd_device_replace()
drivers/iommu/iommu-priv.h | 10 +
drivers/iommu/iommu.c | 41 +-
drivers/iommu/iommufd/device.c | 516 +++++++++++++-----
drivers/iommu/iommufd/hw_pagetable.c | 96 +++-
drivers/iommu/iommufd/io_pagetable.c | 27 +-
drivers/iommu/iommufd/iommufd_private.h | 51 +-
drivers/iommu/iommufd/iommufd_test.h | 6 +
drivers/iommu/iommufd/main.c | 17 +-
drivers/iommu/iommufd/selftest.c | 40 ++
include/linux/iommufd.h | 1 +
include/uapi/linux/iommufd.h | 26 +
tools/testing/selftests/iommu/iommufd.c | 67 ++-
.../selftests/iommu/iommufd_fail_nth.c | 67 ++-
tools/testing/selftests/iommu/iommufd_utils.h | 63 ++-
14 files changed, 825 insertions(+), 203 deletions(-)
create mode 100644 drivers/iommu/iommu-priv.h
base-commit: fd8c1a4aee973e87d890a5861e106625a33b2c4e
--
2.40.0
From: Benjamin Berg <benjamin.berg(a)intel.com>
The existing KUNIT_ARRAY_PARAM macro requires a separate function to
get the description. However, in a lot of cases the description can
just be copied directly from the array. Add a second macro that
avoids having to write a static function just for a single strscpy.
Signed-off-by: Benjamin Berg <benjamin.berg(a)intel.com>
---
include/kunit/test.h | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/include/kunit/test.h b/include/kunit/test.h
index 08d3559dd703..519b90261c72 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -1414,6 +1414,25 @@ do { \
return NULL; \
}
+/**
+ * KUNIT_ARRAY_PARAM_DESC() - Define test parameter generator from an array.
+ * @name: prefix for the test parameter generator function.
+ * @array: array of test parameters.
+ * @desc_member: structure member from array element to use as description
+ *
+ * Define function @name_gen_params which uses @array to generate parameters.
+ */
+#define KUNIT_ARRAY_PARAM_DESC(name, array, desc_member) \
+ static const void *name##_gen_params(const void *prev, char *desc) \
+ { \
+ typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \
+ if (__next - (array) < ARRAY_SIZE((array))) { \
+ strscpy(desc, __next->desc_member, KUNIT_PARAM_DESC_SIZE); \
+ return __next; \
+ } \
+ return NULL; \
+ }
+
// TODO(dlatypov(a)google.com): consider eventually migrating users to explicitly
// include resource.h themselves if they need it.
#include <kunit/resource.h>
--
2.39.2
No need to call maybe_mkwrite() to then wrprotect if the source PMD was not
writable.
It's worth nothing that this now allows for PTEs to be writable even if
the source PMD was not writable: if vma->vm_page_prot includes write
permissions.
As documented in commit 931298e103c2 ("mm/userfaultfd: rely on
vma->vm_page_prot in uffd_wp_range()"), any mechanism that intends to
have pages wrprotected (COW, writenotify, mprotect, uffd-wp, softdirty,
...) has to properly adjust vma->vm_page_prot upfront, to not include
write permissions. If vma->vm_page_prot includes write permissions, the
PTE/PMD can be writable as default.
This now mimics the handling in mm/migrate.c:remove_migration_pte() and in
mm/huge_memory.c:remove_migration_pmd(), which has been in place for a
long time (except that 96a9c287e25d ("mm/migrate: fix wrongly apply write
bit after mkdirty on sparc64") temporarily changed it).
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
mm/huge_memory.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6f3af65435c8..8332e16ac97b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2235,11 +2235,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- entry = maybe_mkwrite(entry, vma);
+ if (write)
+ entry = maybe_mkwrite(entry, vma);
if (anon_exclusive)
SetPageAnonExclusive(page + i);
- if (!write)
- entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
/* NOTE: this may set soft-dirty too on some archs */
--
2.39.2
[ This series depends on the VFIO device cdev series ]
Changelog
v6:
* Rebased on top of cdev v8 series
https://lore.kernel.org/kvm/20230327094047.47215-1-yi.l.liu@intel.com/
* Added "Reviewed-by" from Kevin to PATCH-4
* Squashed access->ioas updating lines into iommufd_access_change_pt(),
and changed function return type accordingly for simplification.
v5:
https://lore.kernel.org/linux-iommu/cover.1679559476.git.nicolinc@nvidia.co…
* Kept the cmd->id in the iommufd_test_create_access() so the access can
be created with an ioas by default. Then, renamed the previous ioctl
IOMMU_TEST_OP_ACCESS_SET_IOAS to IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, so
it would be used to replace an access->ioas pointer.
* Added iommufd_access_replace() API after the introductions of the other
two APIs iommufd_access_attach() and iommufd_access_detach().
* Since vdev->iommufd_attached is also set in emulated pathway too, call
iommufd_access_update(), similar to the physical pathway.
v4:
https://lore.kernel.org/linux-iommu/cover.1678284812.git.nicolinc@nvidia.co…
* Rebased on top of Jason's series adding replace() and hwpt_alloc()
https://lore.kernel.org/linux-iommu/0-v2-51b9896e7862+8a8c-iommufd_alloc_jg…
* Rebased on top of cdev series v6
https://lore.kernel.org/kvm/20230308132903.465159-1-yi.l.liu@intel.com/
* Dropped the patch that's moved to cdev series.
* Added unmap function pointer sanity before calling it.
* Added "Reviewed-by" from Kevin and Yi.
* Added back the VFIO change updating the ATTACH uAPI.
v3:
https://lore.kernel.org/linux-iommu/cover.1677288789.git.nicolinc@nvidia.co…
* Rebased on top of Jason's iommufd_hwpt branch:
https://lore.kernel.org/linux-iommu/0-v2-406f7ac07936+6a-iommufd_hwpt_jgg@n…
* Dropped patches from this series accordingly. There were a couple of
VFIO patches that will be submitted after the VFIO cdev series. Also,
renamed the series to be "emulated".
* Moved dma_unmap sanity patch to the first in the series.
* Moved dma_unmap sanity to cover both VFIO and IOMMUFD pathways.
* Added Kevin's "Reviewed-by" to two of the patches.
* Fixed a NULL pointer bug in vfio_iommufd_emulated_bind().
* Moved unmap() call to the common place in iommufd_access_set_ioas().
v2:
https://lore.kernel.org/linux-iommu/cover.1675802050.git.nicolinc@nvidia.co…
* Rebased on top of vfio_device cdev v2 series.
* Update the kdoc and commit message of iommu_group_replace_domain().
* Dropped revert-to-core-domain part in iommu_group_replace_domain().
* Dropped !ops->dma_unmap check in vfio_iommufd_emulated_attach_ioas().
* Added missing rc value in vfio_iommufd_emulated_attach_ioas() from the
iommufd_access_set_ioas() call.
* Added a new patch in vfio_main to deny vfio_pin/unpin_pages() calls if
vdev->ops->dma_unmap is not implemented.
* Added a __iommmufd_device_detach helper and let the replace routine do
a partial detach().
* Added restriction on auto_domains to use the replace feature.
* Added the patch "iommufd/device: Make hwpt_list list_add/del symmetric"
from the has_group removal series.
v1:
https://lore.kernel.org/linux-iommu/cover.1675320212.git.nicolinc@nvidia.co…
Hi all,
The existing IOMMU APIs provide a pair of functions: iommu_attach_group()
for callers to attach a device from the default_domain (NULL if not being
supported) to a given iommu domain, and iommu_detach_group() for callers
to detach a device from a given domain to the default_domain. Internally,
the detach_dev op is deprecated for the newer drivers with default_domain.
This means that those drivers likely can switch an attaching domain to
another one, without stagging the device at a blocking or default domain,
for use cases such as:
1) vPASID mode, when a guest wants to replace a single pasid (PASID=0)
table with a larger table (PASID=N)
2) Nesting mode, when switching the attaching device from an S2 domain
to an S1 domain, or when switching between relevant S1 domains.
This series is rebased on top of Jason Gunthorpe's series that introduces
iommu_group_replace_domain API and IOMMUFD infrastructure for the IOMMUFD
"physical" devices. The IOMMUFD "emulated" deivces will need some extra
steps to replace the access->ioas object and its iopt pointer.
You can also find this series on Github:
https://github.com/nicolinc/iommufd/commits/iommu_group_replace_domain-v6
Thank you
Nicolin Chen
Nicolin Chen (4):
vfio: Do not allow !ops->dma_unmap in vfio_pin/unpin_pages()
iommufd: Add iommufd_access_replace() API
iommufd/selftest: Add IOMMU_TEST_OP_ACCESS_REPLACE_IOAS coverage
vfio: Support IO page table replacement
drivers/iommu/iommufd/device.c | 53 ++++++++++++++-----
drivers/iommu/iommufd/iommufd_test.h | 4 ++
drivers/iommu/iommufd/selftest.c | 19 +++++++
drivers/vfio/iommufd.c | 11 ++--
drivers/vfio/vfio_main.c | 4 ++
include/linux/iommufd.h | 1 +
include/uapi/linux/vfio.h | 6 +++
tools/testing/selftests/iommu/iommfd*.c | 0
tools/testing/selftests/iommu/iommufd.c | 29 +++++++++-
tools/testing/selftests/iommu/iommufd_utils.h | 19 +++++++
10 files changed, 127 insertions(+), 19 deletions(-)
create mode 100644 tools/testing/selftests/iommu/iommfd*.c
--
2.40.0
From: Rong Tao <rongtao(a)cestc.cn>
When the number of symbols is greater than MAX_SYMS (300000), the access
array struct ksym syms[MAX_SYMS] goes out of bounds, which will result in
a segfault.
Resolve this issue by judging the maximum number and exiting the loop, and
increasing the default size appropriately. (6.2.9 = 329839 below)
$ cat /proc/kallsyms | wc -l
329839
GDB debugging:
$ cd linux/samples/bpf
$ sudo gdb ./sampleip
...
(gdb) r
...
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7e2debf in malloc () from /lib64/libc.so.6
Missing separate debuginfos, use: dnf debuginfo-install
elfutils-libelf-0.189-1.fc37.x86_64 glibc-2.36-9.fc37.x86_64
libzstd-1.5.4-1.fc37.x86_64 zlib-1.2.12-5.fc37.x86_64
(gdb) bt
#0 0x00007ffff7e2debf in malloc () from /lib64/libc.so.6
#1 0x00007ffff7e33f8e in strdup () from /lib64/libc.so.6
#2 0x0000000000403fb0 in load_kallsyms_refresh() from trace_helpers.c
#3 0x00000000004038b2 in main ()
Signed-off-by: Rong Tao <rongtao(a)cestc.cn>
---
tools/testing/selftests/bpf/trace_helpers.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 09a16a77bae4..a9d589c560d2 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -14,7 +14,7 @@
#define DEBUGFS "/sys/kernel/debug/tracing/"
-#define MAX_SYMS 300000
+#define MAX_SYMS 400000
static struct ksym syms[MAX_SYMS];
static int sym_cnt;
@@ -44,7 +44,8 @@ int load_kallsyms_refresh(void)
continue;
syms[i].addr = (long) addr;
syms[i].name = strdup(func);
- i++;
+ if (++i >= MAX_SYMS)
+ break;
}
fclose(f);
sym_cnt = i;
--
2.39.2
There is a spelling mistake in a test assert message. Fix it.
Signed-off-by: Colin Ian King <colin.i.king(a)gmail.com>
---
tools/testing/selftests/kvm/aarch64/smccc_filter.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/aarch64/smccc_filter.c b/tools/testing/selftests/kvm/aarch64/smccc_filter.c
index 0f9db0641847..82650313451a 100644
--- a/tools/testing/selftests/kvm/aarch64/smccc_filter.c
+++ b/tools/testing/selftests/kvm/aarch64/smccc_filter.c
@@ -211,7 +211,7 @@ static void expect_call_fwd_to_user(struct kvm_vcpu *vcpu, uint32_t func_id,
"KVM_HYPERCALL_EXIT_SMC is not set");
else
TEST_ASSERT(!(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC),
- "KVM_HYPERCAL_EXIT_SMC is set");
+ "KVM_HYPERCALL_EXIT_SMC is set");
}
/* SMCCC calls forwarded to userspace cause KVM_EXIT_HYPERCALL exits */
--
2.30.2
From: Feng Zhou <zhoufeng.zf(a)bytedance.com>
When access traced function arguments with type is u32*, bpf verifier failed.
Because u32 have typedef, needs to skip modifier. Add btf_type_is_modifier in
is_int_ptr. Add a selftest to check it.
Feng Zhou (2):
bpf/btf: Fix is_int_ptr()
selftests/bpf: Add test to access u32 ptr argument in tracing program
Changelog:
v1->v2: Addressed comments from Martin KaFai Lau
- Add a selftest.
- use btf_type_skip_modifiers.
Some details in here:
https://lore.kernel.org/all/20221012125815.76120-1-zhouchengming@bytedance.…
kernel/bpf/btf.c | 5 ++---
net/bpf/test_run.c | 8 +++++++-
.../testing/selftests/bpf/verifier/btf_ctx_access.c | 13 +++++++++++++
3 files changed, 22 insertions(+), 4 deletions(-)
--
2.20.1
On Thu, 6 Apr 2023 09:53:37 -0700 Stefan Roesch <shr(a)devkernel.io> wrote:
> So far KSM can only be enabled by calling madvise for memory regions. To
> be able to use KSM for more workloads, KSM needs to have the ability to be
> enabled / disabled at the process / cgroup level.
>
> ...
>
> @@ -53,6 +62,18 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
>
> #else /* !CONFIG_KSM */
>
> +static inline int ksm_add_mm(struct mm_struct *mm)
> +{
> +}
The compiler doesn't like the lack of a return value.
I queued up a patch to simply delete the above function - seems that
ksm_add_mm() has no callers if CONFIG_KSM=n.
The same might be true of the ksm_add_vma()...ksm_exit() stubs also,
Perhaps some kind soul could take a look at whether we can simply clean
those out.
This patch set adds support for using FOU or GUE encapsulation with
an ipip device operating in collect-metadata mode and a set of kfuncs
for controlling encap parameters exposed to a BPF tc-hook.
BPF tc-hooks allow us to read tunnel metadata (like remote IP addresses)
in the ingress path of an externally controlled tunnel interface via
the bpf_skb_get_tunnel_{key,opt} bpf-helpers. Packets can then be
redirected to the same or a different externally controlled tunnel
interface by overwriting metadata via the bpf_skb_set_tunnel_{key,opt}
helpers and a call to bpf_redirect. This enables us to redirect packets
between tunnel interfaces - and potentially change the encapsulation
type - using only a single BPF program.
Today this approach works fine for a couple of tunnel combinations.
For example: redirecting packets between Geneve and GRE interfaces or
GRE and plain ipip interfaces. However, redirecting using FOU or GUE is
not supported today. The ip_tunnel module does not allow us to egress
packets using additional UDP encapsulation from an ipip device in
collect-metadata mode.
Patch 1 lifts this restriction by adding a struct ip_tunnel_encap to
the tunnel metadata. It can be filled by a new BPF kfunc introduced
in Patch 2 and evaluated by the ip_tunnel egress path. This will allow
us to use FOU and GUE encap with externally controlled ipip devices.
Patch 2 introduces two new BPF kfuncs: bpf_skb_{set,get}_fou_encap.
These helpers can be used to set and get UDP encap parameters from the
BPF tc-hook doing the packet redirect.
Patch 3 adds BPF tunnel selftests using the two kfuncs.
---
v2:
- Fixes for checkpatch.pl
- Fixes for kernel test robot
Christian Ehrig (3):
ipip,ip_tunnel,sit: Add FOU support for externally controlled ipip
devices
bpf,fou: Add bpf_skb_{set,get}_fou_encap kfuncs
selftests/bpf: Test FOU kfuncs for externally controlled ipip devices
include/net/fou.h | 2 +
include/net/ip_tunnels.h | 28 +++--
net/ipv4/Makefile | 2 +-
net/ipv4/fou_bpf.c | 119 ++++++++++++++++++
net/ipv4/fou_core.c | 5 +
net/ipv4/ip_tunnel.c | 22 +++-
net/ipv4/ipip.c | 1 +
net/ipv6/sit.c | 2 +-
.../selftests/bpf/progs/test_tunnel_kern.c | 117 +++++++++++++++++
tools/testing/selftests/bpf/test_tunnel.sh | 81 ++++++++++++
10 files changed, 362 insertions(+), 17 deletions(-)
create mode 100644 net/ipv4/fou_bpf.c
--
2.39.2
Hi Christophe Leroy and gpio experts,
Greeting!
Platform: Tigerlake-H and so on x86 platforms
All detailed info is in link: https://github.com/xupengfe/syzkaller_logs/tree/main/issue_bisect/230406_gp…
Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/issue_bisect/230406_gp…
gpio-mockup.sh kslef-test failed in v6.3-rc5 kernel.
gpio-mockup.sh(gpio overflow test) in kself-test could reproduce this issue:
cd linux/tools/testing/selftests
1. ./kselftest_install.sh
2. cd linux/tools/testing/selftests/kselftest_install/gpio
# ./gpio-mockup.sh
1. Module load tests
1.1. dynamic allocation of gpio
2. Module load error tests
2.1 gpio overflow
test failed: unexpected chip - gpiochip1
GPIO gpio-mockup test FAIL
And the simplified steps to reproduce this issue are as follow:
"
# Load gpio_mockup with overflow ranges -1,1024:
modprobe -q gpio_mockup gpio_mockup_ranges="-1,1024"
# Check is there some Call Trace generated in dmesg
dmesg | grep -C 5 Call
# Should not generate any gpiochip folder like /sys/kernel/debug/gpio-mockup/gpiochip1
# Because load gpio_mockup with overflow ranges -1,1024
find "/sys/kernel/debug/gpio-mockup/" -name gpiochip* -type d | sort
# Unload the gpio_mockup module
modprobe -r gpio_mockup
# Check is there "Call Trace" generated in dmesg
dmesg | grep -C 5 Call
"
Actually the judgement "gpio-mockup.sh" test/bisect judgement point is that:
Should not generate any gpiochip folder like
/sys/kernel/debug/gpio-mockup/gpiochip1 after load gpio_mockup with overflow
ranges -1,1024.
I met gpio-mockup.sh test failed but there is no any "Call Trace" dmesg info
sometimes.
So the shortest check steps are as follow:
"
1. modprobe -q gpio_mockup gpio_mockup_ranges="-1,1024"
After above gpio_mockup module loaded with overflow range "-1,1024":
Correct behavior as previous v6.1 or older kernel:"gpio should not load "gpiochip1" due to overflow range -1,1024";
Wrong behavior in v6.3-rc5 kernel: "gpio *load* "gpiochip1" with overflow range -1,1024 and "gpiochip1" should not be loaded".
The underlying problem was already buried here.
2. Could use below command to check if "gpiochip1" generated:
As before v6.1, there was no "/sys/kernel/debug/gpio-mockup/gpiochip1" sysfs folder due to overflow range -1,1024";
Wrong behavior in v6.3-rc5 kernel: "/sys/kernel/debug/gpio-mockup/gpiochip1" sysfs folder generated as follow command check:
# find "/sys/kernel/debug/gpio-mockup/" -name gpiochip* -type d | sort
/sys/kernel/debug/gpio-mockup/gpiochip1
If there is gpiochip* generated, gpio-mockup.sh kself-test would be failed also.
"
Bisected and found the bad commit was:
"
7b61212f2a07a5afd213c8876e52b5c9946441e2
gpiolib: Get rid of ARCH_NR_GPIOS
"
And after reverted the above commit on top of v6.3-rc5 kernel, above
gpio-mockup.sh kself-test could pass and this issue was gone.
Now gpio-mockup.sh kself-test is failed on almost all x86 platform from
v6.2 cycle mainline kernel.
I hope above info is helpful to solve the "gpio-mockup.sh kself-test failed"
problem.
Thanks!
BR.
Hi Linus,
Please pull the following Kselftest fixes update for Linux 6.3-rc6.
This Kselftest fixes update for Linux 6.3-rc6 consists of one single
fix to mount_setattr_test build failure.
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 05107edc910135d27fe557267dc45be9630bf3dd:
selftests: sigaltstack: fix -Wuninitialized (2023-03-20 17:28:31 -0600)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux-kselftest-fixes-6.3-rc6
for you to fetch changes up to f1594bc676579133a3cd906d7d27733289edfb86:
selftests mount: Fix mount_setattr_test builds failed (2023-03-31 09:18:45 -0600)
----------------------------------------------------------------
linux-kselftest-fixes-6.3-rc6
This Kselftest fixes update for Linux 6.3-rc6 consists of one single
fix to mount_setattr_test build failure.
----------------------------------------------------------------
Anh Tuan Phan (1):
selftests mount: Fix mount_setattr_test builds failed
tools/testing/selftests/mount_setattr/mount_setattr_test.c | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------
So far KSM can only be enabled by calling madvise for memory regions. To
be able to use KSM for more workloads, KSM needs to have the ability to be
enabled / disabled at the process / cgroup level.
Use case 1:
The madvise call is not available in the programming language. An example for
this are programs with forked workloads using a garbage collected language without
pointers. In such a language madvise cannot be made available.
In addition the addresses of objects get moved around as they are garbage
collected. KSM sharing needs to be enabled "from the outside" for these type of
workloads.
Use case 2:
The same interpreter can also be used for workloads where KSM brings no
benefit or even has overhead. We'd like to be able to enable KSM on a workload
by workload basis.
Use case 3:
With the madvise call sharing opportunities are only enabled for the current
process: it is a workload-local decision. A considerable number of sharing
opportuniites may exist across multiple workloads or jobs. Only a higler level
entity like a job scheduler or container can know for certain if its running
one or more instances of a job. That job scheduler however doesn't have
the necessary internal worklaod knowledge to make targeted madvise calls.
Security concerns:
In previous discussions security concerns have been brought up. The problem is
that an individual workload does not have the knowledge about what else is
running on a machine. Therefore it has to be very conservative in what memory
areas can be shared or not. However, if the system is dedicated to running
multiple jobs within the same security domain, its the job scheduler that has
the knowledge that sharing can be safely enabled and is even desirable.
Performance:
Experiments with using UKSM have shown a capacity increase of around 20%.
1. New options for prctl system command
This patch series adds two new options to the prctl system call. The first
one allows to enable KSM at the process level and the second one to query the
setting.
The setting will be inherited by child processes.
With the above setting, KSM can be enabled for the seed process of a cgroup
and all processes in the cgroup will inherit the setting.
2. Changes to KSM processing
When KSM is enabled at the process level, the KSM code will iterate over all
the VMA's and enable KSM for the eligible VMA's.
When forking a process that has KSM enabled, the setting will be inherited by
the new child process.
In addition when KSM is disabled for a process, KSM will be disabled for the
VMA's where KSM has been enabled.
3. Add general_profit metric
The general_profit metric of KSM is specified in the documentation, but not
calculated. This adds the general profit metric to /sys/kernel/debug/mm/ksm.
4. Add more metrics to ksm_stat
This adds the process profit and ksm type metric to /proc/<pid>/ksm_stat.
5. Add more tests to ksm_tests
This adds an option to specify the merge type to the ksm_tests. This allows to
test madvise and prctl KSM. It also adds a new option to query if prctl KSM has
been enabled. It adds a fork test to verify that the KSM process setting is
inherited by client processes.
Changes:
- V4:
- removing check in prctl for MMF_VM_MERGEABLE in PR_SET_MEMORY_MERGE
handling
- Checking for VM_MERGEABLE AND MMF_VM_MERGE_ANY to avoid chaning vm_flags
- This requires also checking that the vma is compatible. The
compatibility check is provided by a new helper
- processes which have set MMF_VM_MERGE_ANY, only need to call the
helper and not madvise.
- removed unmerge_vmas function, this function is no longer necessary,
clearing the MMF_VM_MERGE_ANY bit is sufficient
- V3:
- folded patch 1 - 6
- folded patch 7 - 14
- folded patch 15 - 19
- Expanded on the use cases in the cover letter
- Added a section on security concerns to the cover letter
- V2:
- Added use cases to the cover letter
- Removed the tracing patch from the patch series and posted it as an
individual patch
- Refreshed repo
Stefan Roesch (3):
mm: add new api to enable ksm per process
mm: add new KSM process and sysfs knobs
selftests/mm: add new selftests for KSM
Documentation/ABI/testing/sysfs-kernel-mm-ksm | 8 +
Documentation/admin-guide/mm/ksm.rst | 8 +-
fs/proc/base.c | 5 +
include/linux/ksm.h | 19 +-
include/linux/sched/coredump.h | 1 +
include/uapi/linux/prctl.h | 2 +
kernel/sys.c | 27 ++
mm/ksm.c | 137 +++++++---
tools/include/uapi/linux/prctl.h | 2 +
tools/testing/selftests/mm/Makefile | 2 +-
tools/testing/selftests/mm/ksm_tests.c | 254 +++++++++++++++---
11 files changed, 388 insertions(+), 77 deletions(-)
--
2.34.1
So far KSM can only be enabled by calling madvise for memory regions. To
be able to use KSM for more workloads, KSM needs to have the ability to be
enabled / disabled at the process / cgroup level.
Use case 1:
The madvise call is not available in the programming language. An example for
this are programs with forked workloads using a garbage collected language without
pointers. In such a language madvise cannot be made available.
In addition the addresses of objects get moved around as they are garbage
collected. KSM sharing needs to be enabled "from the outside" for these type of
workloads.
Use case 2:
The same interpreter can also be used for workloads where KSM brings no
benefit or even has overhead. We'd like to be able to enable KSM on a workload
by workload basis.
Use case 3:
With the madvise call sharing opportunities are only enabled for the current
process: it is a workload-local decision. A considerable number of sharing
opportunities may exist across multiple workloads or jobs (if they are part
of the same security domain). Only a higler level entity like a job scheduler
or container can know for certain if its running one or more instances of a
job. That job scheduler however doesn't have the necessary internal workload
knowledge to make targeted madvise calls.
Security concerns:
In previous discussions security concerns have been brought up. The problem is
that an individual workload does not have the knowledge about what else is
running on a machine. Therefore it has to be very conservative in what memory
areas can be shared or not. However, if the system is dedicated to running
multiple jobs within the same security domain, its the job scheduler that has
the knowledge that sharing can be safely enabled and is even desirable.
Performance:
Experiments with using UKSM have shown a capacity increase of around 20%.
1. New options for prctl system command
This patch series adds two new options to the prctl system call. The first
one allows to enable KSM at the process level and the second one to query the
setting.
The setting will be inherited by child processes.
With the above setting, KSM can be enabled for the seed process of a cgroup
and all processes in the cgroup will inherit the setting.
2. Changes to KSM processing
When KSM is enabled at the process level, the KSM code will iterate over all
the VMA's and enable KSM for the eligible VMA's.
When forking a process that has KSM enabled, the setting will be inherited by
the new child process.
In addition when KSM is disabled for a process, KSM will be disabled for the
VMA's where KSM has been enabled.
3. Add general_profit metric
The general_profit metric of KSM is specified in the documentation, but not
calculated. This adds the general profit metric to /sys/kernel/debug/mm/ksm.
4. Add more metrics to ksm_stat
This adds the process profit and ksm type metric to /proc/<pid>/ksm_stat.
5. Add more tests to ksm_tests
This adds an option to specify the merge type to the ksm_tests. This allows to
test madvise and prctl KSM. It also adds a new option to query if prctl KSM has
been enabled. It adds a fork test to verify that the KSM process setting is
inherited by client processes.
Changes:
- V5:
- When the prctl system call is invoked, mark all compatible VMA
as mergeable
- Instead of checcking during scan if VMA is mergeable, mark the VMA
mergeable when the VMA is created (in case the VMA is compatible)
- Remove earlier changes, they are no longer necessary
- Unset the flag MMF_VM_MERGE_ANY in gmap_mark_unmergeable().
- When unsetting the MMF_VM_MERGE_ANY flag with prctl, only unset the
flag
- Remove pages_volatile function (with the simplar general_profit calculation,
the function is no longer needed)
- Use simpler formula for calculation of general_profit
- V4:
- removing check in prctl for MMF_VM_MERGEABLE in PR_SET_MEMORY_MERGE
handling
- Checking for VM_MERGEABLE AND MMF_VM_MERGE_ANY to avoid chaning vm_flags
- This requires also checking that the vma is compatible. The
compatibility check is provided by a new helper
- processes which have set MMF_VM_MERGE_ANY, only need to call the
helper and not madvise.
- removed unmerge_vmas function, this function is no longer necessary,
clearing the MMF_VM_MERGE_ANY bit is sufficient
- V3:
- folded patch 1 - 6
- folded patch 7 - 14
- folded patch 15 - 19
- Expanded on the use cases in the cover letter
- Added a section on security concerns to the cover letter
- V2:
- Added use cases to the cover letter
- Removed the tracing patch from the patch series and posted it as an
individual patch
- Refreshed repo
Stefan Roesch (3):
mm: add new api to enable ksm per process
mm: add new KSM process and sysfs knobs
selftests/mm: add new selftests for KSM
Documentation/ABI/testing/sysfs-kernel-mm-ksm | 8 +
Documentation/admin-guide/mm/ksm.rst | 8 +-
arch/s390/mm/gmap.c | 1 +
fs/proc/base.c | 5 +
include/linux/ksm.h | 36 ++-
include/linux/sched/coredump.h | 1 +
include/uapi/linux/prctl.h | 2 +
kernel/fork.c | 1 +
kernel/sys.c | 24 ++
mm/ksm.c | 143 ++++++++--
mm/mmap.c | 7 +
tools/include/uapi/linux/prctl.h | 2 +
tools/testing/selftests/mm/Makefile | 2 +-
tools/testing/selftests/mm/ksm_tests.c | 254 +++++++++++++++---
14 files changed, 426 insertions(+), 68 deletions(-)
--
2.34.1
This patch set makes it possible to have synchronized dynamic ATU and FDB
entries on locked ports. As locked ports are not able to automatically
learn, they depend on userspace added entries, where userspace can add
static or dynamic entries. The lifetime of static entries are completely
dependent on userspace intervention, and thus not of interest here. We
are only concerned with dynamic entries, which can be added with a
command like:
bridge fdb replace ADDR dev <DEV> master dynamic
We choose only to support this feature on locked ports, as it involves
utilizing the CPU to handle ATU related switchcore events (typically
interrupts) and thus can result in significant performance loss if
exposed to heavy traffic.
On locked ports it is important for userspace to know when an authorized
station has become silent, hence not breaking the communication of a
station that has been authorized based on the MAC-Authentication Bypass
(MAB) scheme. Thus if the station keeps being active after authorization,
it will continue to have an open port as long as it is active. Only after
a silent period will it have to be reauthorized. As the ageing process in
the ATU is dependent on incoming traffic to the switchcore port, it is
necessary for the ATU to signal that an entry has aged out, so that the
FDB can be updated at the correct time.
This patch set includes a solution for the Marvell mv88e6xxx driver, where
for this driver we use the Hold-At-One feature so that an age-out
violation interrupt occurs when a station has been silent for the
system-set age time. The age out violation interrupt allows the switchcore
driver to remove both the ATU and the FDB entry at the same time.
It is up to the maintainers of other switchcore drivers to implement the
feature for their specific driver.
LOG:
V2: Ensure the port is locked when using the feature as we
must ensure that learning is enabled at all times for
the interrupts to occur. This was missed in the previous
version.
Instead of ignoring unsupported flags, ensure that
drivers are only called when supporting the feature.
As 'dynamic' flag is legacy, all drivers support it at
least by their previous handling.
Hans J. Schultz (6):
net: bridge: add dynamic flag to switchdev notifier
net: dsa: propagate flags down towards drivers
drivers: net: dsa: add fdb entry flags incoming to switchcore drivers
net: bridge: ensure FDB offloaded flag is handled as needed
net: dsa: mv88e6xxx: implementation of dynamic ATU entries
selftests: forwarding: add dynamic FDB test
drivers/net/dsa/b53/b53_common.c | 4 +-
drivers/net/dsa/b53/b53_priv.h | 4 +-
drivers/net/dsa/hirschmann/hellcreek.c | 4 +-
drivers/net/dsa/lan9303-core.c | 4 +-
drivers/net/dsa/lantiq_gswip.c | 4 +-
drivers/net/dsa/microchip/ksz_common.c | 6 +-
drivers/net/dsa/mt7530.c | 4 +-
drivers/net/dsa/mv88e6xxx/chip.c | 20 ++++--
drivers/net/dsa/mv88e6xxx/chip.h | 9 ++-
drivers/net/dsa/mv88e6xxx/global1_atu.c | 21 +++++++
drivers/net/dsa/mv88e6xxx/port.c | 6 +-
drivers/net/dsa/mv88e6xxx/switchdev.c | 61 +++++++++++++++++++
drivers/net/dsa/mv88e6xxx/switchdev.h | 5 ++
drivers/net/dsa/mv88e6xxx/trace.h | 5 ++
drivers/net/dsa/ocelot/felix.c | 4 +-
drivers/net/dsa/qca/qca8k-common.c | 4 +-
drivers/net/dsa/qca/qca8k.h | 4 +-
drivers/net/dsa/rzn1_a5psw.c | 4 +-
drivers/net/dsa/sja1105/sja1105_main.c | 11 ++--
include/net/dsa.h | 9 ++-
include/net/switchdev.h | 1 +
net/bridge/br_fdb.c | 5 +-
net/bridge/br_switchdev.c | 1 +
net/dsa/dsa.c | 6 ++
net/dsa/port.c | 28 +++++----
net/dsa/port.h | 8 +--
net/dsa/slave.c | 20 ++++--
net/dsa/switch.c | 26 +++++---
net/dsa/switch.h | 1 +
.../net/forwarding/bridge_locked_port.sh | 36 +++++++++++
30 files changed, 258 insertions(+), 67 deletions(-)
--
2.34.1
At present the kselftest header can't be used with nolibc since it makes
use of vprintf() which is not available in nolibc and seems like it would
be inappropriate to implement given the minimal system requirements and
environment intended for nolibc. This has resulted in some open coded
kselftests which use nolibc to test features that are supposed to be
controlled via libc and therefore better exercised in an environment with
no libc.
Rather than continue this let's factor out the I/O routines in kselftest.h
into a separate header file and provide a nolibc implementation which only
allows simple strings to be provided rather than full printf() support.
This is limiting but a great improvement on sharing no code at all.
As an example of using this I've updated the arm64 za-fork test to use
the standard kselftest.h.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Mark Brown (2):
kselftest: Support nolibc
kselftest/arm64: Convert za-fork to use kselftest.h
tools/testing/selftests/arm64/fp/Makefile | 2 +-
tools/testing/selftests/arm64/fp/za-fork.c | 88 +++--------------
tools/testing/selftests/kselftest-nolibc.h | 93 ++++++++++++++++++
tools/testing/selftests/kselftest-std.h | 151 +++++++++++++++++++++++++++++
tools/testing/selftests/kselftest.h | 149 +++-------------------------
5 files changed, 272 insertions(+), 211 deletions(-)
---
base-commit: e8d018dd0257f744ca50a729e3d042cf2ec9da65
change-id: 20230405-kselftest-nolibc-cb2ce0446d09
Best regards,
--
Mark Brown <broonie(a)kernel.org>
On 10.03.23 19:28, Stefan Roesch wrote:
> This adds the general_profit KSM sysfs knob and the process profit metric
> and process merge type knobs to ksm_stat.
>
> 1) split off pages_volatile function
>
> This splits off the pages_volatile function. The next patch will
> use this function.
>
> 2) expose general_profit metric
>
> The documentation mentions a general profit metric, however this
> metric is not calculated. In addition the formula depends on the size
> of internal structures, which makes it more difficult for an
> administrator to make the calculation. Adding the metric for a better
> user experience.
>
> 3) document general_profit sysfs knob
>
> 4) calculate ksm process profit metric
>
> The ksm documentation mentions the process profit metric and how to
> calculate it. This adds the calculation of the metric.
>
> 5) add ksm_merge_type() function
>
> This adds the ksm_merge_type function. The function returns the
> merge type for the process. For madvise it returns "madvise", for
> prctl it returns "process" and otherwise it returns "none".
>
> 6) mm: expose ksm process profit metric and merge type in ksm_stat
>
> This exposes the ksm process profit metric in /proc/<pid>/ksm_stat.
> The name of the value is ksm_merge_type. The documentation mentions
> the formula for the ksm process profit metric, however it does not
> calculate it. In addition the formula depends on the size of internal
> structures. So it makes sense to expose it.
>
> 7) document new procfs ksm knobs
>
Often, when you have to start making a list of things that a patch does,
it might make sense to split some of the items into separate patches
such that you can avoid lists and just explain in list-free text how the
pieces in the patch fit together.
I'd suggest splitting this patch into logical pieces. For example,
separating the general profit calculation/exposure from the per-mm
profit and the per-mm ksm type indication.
> Link: https://lkml.kernel.org/r/20230224044000.3084046-3-shr@devkernel.io
> Signed-off-by: Stefan Roesch <shr(a)devkernel.io>
> Reviewed-by: Bagas Sanjaya <bagasdotme(a)gmail.com>
> Cc: David Hildenbrand <david(a)redhat.com>
> Cc: Johannes Weiner <hannes(a)cmpxchg.org>
> Cc: Michal Hocko <mhocko(a)suse.com>
> Cc: Rik van Riel <riel(a)surriel.com>
> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
> ---
[...]
> KSM_ATTR_RO(pages_volatile);
>
> @@ -3280,6 +3305,21 @@ static ssize_t zero_pages_sharing_show(struct kobject *kobj,
> }
> KSM_ATTR_RO(zero_pages_sharing);
>
> +static ssize_t general_profit_show(struct kobject *kobj,
> + struct kobj_attribute *attr, char *buf)
> +{
> + long general_profit;
> + long all_rmap_items;
> +
> + all_rmap_items = ksm_max_page_sharing + ksm_pages_shared +
> + ksm_pages_unshared + pages_volatile();
Are you sure you want to count a config knob (ksm_max_page_sharing) into
that formula? I yet have to digest what this calculation implies, but it
does feel odd.
Further, maybe just avoid pages_volatile(). Expanding the formula
(excluding ksm_max_page_sharing for now):
all_rmap = ksm_pages_shared + ksm_pages_unshared + pages_volatile();
-> expand pages_volatile() (ignoring the < 0 case)
all_rmap = ksm_pages_shared + ksm_pages_unshared + ksm_rmap_items -
ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared;
-> simplify
all_rmap = ksm_rmap_items + ksm_pages_sharing;
Or is the < 0 case relevant here?
> + general_profit = ksm_pages_sharing * PAGE_SIZE -
> + all_rmap_items * sizeof(struct ksm_rmap_item);
> +
> + return sysfs_emit(buf, "%ld\n", general_profit);
> +}
> +KSM_ATTR_RO(general_profit);
> +
> static ssize_t stable_node_dups_show(struct kobject *kobj,
> struct kobj_attribute *attr, char *buf)
> {
> @@ -3345,6 +3385,7 @@ static struct attribute *ksm_attrs[] = {
> &stable_node_dups_attr.attr,
> &stable_node_chains_prune_millisecs_attr.attr,
> &use_zero_pages_attr.attr,
> + &general_profit_attr.attr,
> NULL,
> };
>
The calculations (profit) don't include when KSM places the shared
zeropage I guess. Accounting that per MM (and eventually globally) is in
the works. [1]
[1]
https://lore.kernel.org/lkml/20230328153852.26c2577e4bd921c371c47a7e@linux-…
--
Thanks,
David / dhildenb
Commit 515bddf0ec41 ("selftests/clone3: test clone3 with CLONE_NEWTIME")
added an additional test, so the number passed to ksft_set_plan needs to
be bumped accordingly.
Also use ksft_finish to print results and exit. This will catch future
mismatches between ksft_set_plan and the number of tests being run.
Fixes: 515bddf0ec41 ("selftests/clone3: test clone3 with CLONE_NEWTIME")
Cc: Christian Brauner <brauner(a)kernel.org>
Signed-off-by: Tobias Klauser <tklauser(a)distanz.ch>
---
tools/testing/selftests/clone3/clone3.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
index 4fce46afe6db..7b45c9854202 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
uid_t uid = getuid();
ksft_print_header();
- ksft_set_plan(17);
+ ksft_set_plan(18);
test_clone3_supported();
/* Just a simple clone3() should return 0.*/
--
2.39.1
Add unaligned descriptor test for frame size of 4001. Using an odd frame
size ensures that the end of the UMEM is not near a page boundary. This
allows testing descriptors that staddle the end of the UMEM but not a
page.
This test used to fail without the previous commit ("xsk: Add check for
unaligned descriptors that overrun UMEM").
Signed-off-by: Kal Conley <kal.conley(a)dectris.com>
---
tools/testing/selftests/bpf/xskxceiver.c | 24 ++++++++++++++++++++++++
tools/testing/selftests/bpf/xskxceiver.h | 1 +
2 files changed, 25 insertions(+)
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 1a4bdd5aa78c..5a9691e942de 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -69,6 +69,7 @@
*/
#define _GNU_SOURCE
+#include <assert.h>
#include <fcntl.h>
#include <errno.h>
#include <getopt.h>
@@ -1876,6 +1877,29 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
test->ifobj_rx->umem->unaligned_mode = true;
testapp_invalid_desc(test);
break;
+ case TEST_TYPE_UNALIGNED_INV_DESC_4K1_FRAME: {
+ u64 page_size, umem_size;
+
+ if (!hugepages_present(test->ifobj_tx)) {
+ ksft_test_result_skip("No 2M huge pages present.\n");
+ return;
+ }
+ test_spec_set_name(test, "UNALIGNED_INV_DESC_4K1_FRAME_SIZE");
+ /* Odd frame size so the UMEM doesn't end near a page boundary. */
+ test->ifobj_tx->umem->frame_size = 4001;
+ test->ifobj_rx->umem->frame_size = 4001;
+ test->ifobj_tx->umem->unaligned_mode = true;
+ test->ifobj_rx->umem->unaligned_mode = true;
+ /* This test exists to test descriptors that staddle the end of
+ * the UMEM but not a page.
+ */
+ page_size = sysconf(_SC_PAGESIZE);
+ umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size;
+ assert(umem_size % page_size > PKT_SIZE);
+ assert(umem_size % page_size < page_size - PKT_SIZE);
+ testapp_invalid_desc(test);
+ break;
+ }
case TEST_TYPE_UNALIGNED:
if (!testapp_unaligned(test))
return;
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index cc24ab72f3ff..919327807a4e 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -78,6 +78,7 @@ enum test_type {
TEST_TYPE_ALIGNED_INV_DESC,
TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME,
TEST_TYPE_UNALIGNED_INV_DESC,
+ TEST_TYPE_UNALIGNED_INV_DESC_4K1_FRAME,
TEST_TYPE_HEADROOM,
TEST_TYPE_TEARDOWN,
TEST_TYPE_BIDI,
--
2.39.2