I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go. With some upcoming changes the library is sitting at 5MiB for a parse. Most of that memory is simply copying the BTF blob into user space. By allowing vmlinux BTF to be mmapped read-only into user space I can cut memory usage by about 75%.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
---
Changes in v2:
- Use btf__new in selftest
- Avoid vm_iomap_memory in btf_vmlinux_mmap
- Add VM_DONTDUMP
- Add support to libbpf
- Link to v1: https://lore.kernel.org/r/20250501-vmlinux-mmap-v1-0-aa2724572598@isovalent....
---
Lorenz Bauer (3):
      btf: allow mmap of vmlinux btf
      selftests: bpf: add a test for mmapable vmlinux BTF
      libbpf: Use mmap to parse vmlinux BTF from sysfs
 include/asm-generic/vmlinux.lds.h                  |  3 +-
 kernel/bpf/sysfs_btf.c                             | 36 +++++++++-
 tools/lib/bpf/btf.c                                | 82 +++++++++++++++++++---
 tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 82 ++++++++++++++++++++++
 4 files changed, 189 insertions(+), 14 deletions(-)
---
base-commit: 38d976c32d85ef12dcd2b8a231196f7049548477
change-id: 20250501-vmlinux-mmap-2ec5563c3ef1
Best regards,
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmaping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
---
 include/asm-generic/vmlinux.lds.h |  3 ++-
 kernel/bpf/sysfs_btf.c            | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
  */
 #ifdef CONFIG_DEBUG_INFO_BTF
 #define BTF								\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {				\
 		BOUNDED_SECTION_BY(.BTF, _BTF)				\
 	}								\
-	. = ALIGN(4);							\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {			\
 		*(.BTF_ids)						\
 	}
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a7157929c50f62a5c6862e7a3d081..f4b59b1c2e5b11ffffa80662ad39334c730019ee 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,18 +7,50 @@
 #include <linux/kobject.h>
 #include <linux/init.h>
 #include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
 
 /* See scripts/link-vmlinux.sh, gen_btf() func for details */
 extern char __start_BTF[];
 extern char __stop_BTF[];
 
+struct kobject *btf_kobj;
+
+static int btf_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+			    const struct bin_attribute *attr,
+			    struct vm_area_struct *vma)
+{
+	phys_addr_t start = virt_to_phys(__start_BTF);
+	size_t btf_size = __stop_BTF - __start_BTF;
+	size_t vm_size = vma->vm_end - vma->vm_start;
+	unsigned long pfn = start >> PAGE_SHIFT;
+	unsigned long pages = PAGE_ALIGN(btf_size) >> PAGE_SHIFT;
+
+	if (kobj != btf_kobj)
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE|VM_EXEC|VM_MAYSHARE))
+		return -EACCES;
+
+	if (pfn + pages < pfn)
+		return -EINVAL;
+
+	if (vm_size >> PAGE_SHIFT > pages)
+		return -EINVAL;
+
+	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC|VM_MAYWRITE);
+	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
 static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
 	.attr = { .name = "vmlinux", .mode = 0444, },
 	.read_new = sysfs_bin_attr_simple_read,
+	.mmap = btf_vmlinux_mmap,
 };
 
-struct kobject *btf_kobj;
-
 static int __init btf_vmlinux_init(void)
 {
 	bin_attr_btf_vmlinux.private = __start_BTF;
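For illustration only (not part of the patch): with the handler above in place, a user-space consumer could map the blob roughly like the sketch below. The map_vmlinux_btf() helper name is made up for this example; the selftest later in the series exercises the same path with proper error checks.

#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static void *map_vmlinux_btf(size_t *size)
{
	struct stat st;
	void *data;
	int fd;

	fd = open("/sys/kernel/btf/vmlinux", O_RDONLY);
	if (fd < 0)
		return NULL;

	if (fstat(fd, &st) < 0) {
		close(fd);
		return NULL;
	}

	/* the handler only accepts PROT_READ with MAP_PRIVATE at offset 0 */
	data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	close(fd);
	if (data == MAP_FAILED)
		return NULL;

	*size = st.st_size;
	return data;
}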
On Fri, May 2, 2025 at 11:20 AM Lorenz Bauer lmb@isovalent.com wrote:
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmaping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
I just realised that there is also code which exposes module BTF via sysfs, which my code currently doesn't handle. I'll send a v3.
On Fri, May 2, 2025 at 3:20 AM Lorenz Bauer lmb@isovalent.com wrote:
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmaping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
 include/asm-generic/vmlinux.lds.h |  3 ++-
 kernel/bpf/sysfs_btf.c            | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
  */
 #ifdef CONFIG_DEBUG_INFO_BTF
 #define BTF								\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {				\
 		BOUNDED_SECTION_BY(.BTF, _BTF)				\
 	}								\
-	. = ALIGN(4);							\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {			\
 		*(.BTF_ids)						\
 	}
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a7157929c50f62a5c6862e7a3d081..f4b59b1c2e5b11ffffa80662ad39334c730019ee 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,18 +7,50 @@
 #include <linux/kobject.h>
 #include <linux/init.h>
 #include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
 
 /* See scripts/link-vmlinux.sh, gen_btf() func for details */
 extern char __start_BTF[];
 extern char __stop_BTF[];
 
+struct kobject *btf_kobj;
+
+static int btf_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+			    const struct bin_attribute *attr,
+			    struct vm_area_struct *vma)
+{
+	phys_addr_t start = virt_to_phys(__start_BTF);
+	size_t btf_size = __stop_BTF - __start_BTF;
+	size_t vm_size = vma->vm_end - vma->vm_start;
+	unsigned long pfn = start >> PAGE_SHIFT;
+	unsigned long pages = PAGE_ALIGN(btf_size) >> PAGE_SHIFT;
+
+	if (kobj != btf_kobj)
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE|VM_EXEC|VM_MAYSHARE))
+		return -EACCES;
+
+	if (pfn + pages < pfn)
+		return -EINVAL;
+
+	if (vm_size >> PAGE_SHIFT > pages)
+		return -EINVAL;
+
+	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC|VM_MAYWRITE);
+	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
remap_pfn_range() should be avoided. See big comment in kernel/events/core.c in map_range().
The following seems to work:

diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index f4b59b1c2e5b..7d0fd28070d8 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -20,13 +20,13 @@ static int btf_vmlinux_mmap(struct file *filp, struct kobject *kobj,
 			    const struct bin_attribute *attr,
 			    struct vm_area_struct *vma)
 {
-	phys_addr_t start = virt_to_phys(__start_BTF);
+	unsigned long addr = (unsigned long)__start_BTF;
 	size_t btf_size = __stop_BTF - __start_BTF;
 	size_t vm_size = vma->vm_end - vma->vm_start;
-	unsigned long pfn = start >> PAGE_SHIFT;
 	unsigned long pages = PAGE_ALIGN(btf_size) >> PAGE_SHIFT;
+	int i, err = 0;
 
-	if (kobj != btf_kobj)
+	if (kobj != btf_kobj || !pages)
 		return -EINVAL;
 
 	if (vma->vm_pgoff)
@@ -35,14 +35,17 @@ static int btf_vmlinux_mmap(struct file *filp, struct kobject *kobj,
 	if (vma->vm_flags & (VM_WRITE|VM_EXEC|VM_MAYSHARE))
 		return -EACCES;
 
-	if (pfn + pages < pfn)
-		return -EINVAL;
-
 	if (vm_size >> PAGE_SHIFT > pages)
 		return -EINVAL;
 
 	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC|VM_MAYWRITE);
-	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+
+	for (i = 0; i < pages && !err; i++, addr += PAGE_SIZE)
+		err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+				     virt_to_page(addr));
+	if (err)
+		zap_page_range_single(vma, vma->vm_start, pages * PAGE_SIZE, NULL);
+
+	return err;
 }
Great that you added:

	/* Check padding is zeroed */
	for (int i = 0; i < trailing; i++) {
		if (((__u8 *)raw_data)[btf_size + i] != 0) {
			PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i);
			goto cleanup;
		}
	}
but this part is puzzling: trailing = page_size - (btf_size % page_size) % page_size;
On Fri, May 2, 2025 at 6:15 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
remap_pfn_range() should be avoided. See big comment in kernel/events/core.c in map_range().
The following seems to work:
Thanks, this helped a lot.
but this part is puzzling: trailing = page_size - (btf_size % page_size) % page_size;
The intention is to calculate how many bytes of trailing zeroes to expect while accounting for the case where btf_size % page_size == 0. I could replace this with a check
	end = btf_size + (page_size - 1) / page_size * page_size;
	for (i = btf_size; i < end; i++)
		...
Better?
In the meantime I've looked at allowing mmap of kmods. I'm not sure it's worth the effort:
1. Allocations of btf->data in btf_parse_module() would have to use
vmalloc_user() so that allocations are page aligned and zeroed
appropriately. This will be a bit more expensive on systems with large
pages and / or many small kmod BTFs. We could only allow mmap of BTF
>= PAGE_SIZE, at additional complexity.
2. We need to hold a refcount on struct btf for each mmapped kernel module, so that btf->data doesn't get freed. Taking the refcount can happen in the sysfs mmap handler, but dropping it is tricky. kernfs / sysfs doesn't allow using vm_ops->close (see kernfs_fop_mmap). It seems possible to use struct kernfs_ops->release(), but I don't understand at all how that deals with multiple mmaps of the same file in a single process. Also makes me wonder what happens when a process mmaps the kmod BTF, the module is unloaded and then the process attempts to access the mmap. My cursory understanding is that this would raise a fault, which isn't great at all.
If nobody objects / has solutions I'll send a v3 of my original patch with reviews addressed but without being able to mmap kmods.
Thanks Lorenz
On Mon, May 5, 2025 at 7:37 AM Lorenz Bauer lmb@isovalent.com wrote:
On Fri, May 2, 2025 at 6:15 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
remap_pfn_range() should be avoided. See big comment in kernel/events/core.c in map_range().
The following seems to work:
Thanks, this helped a lot.
but this part is puzzling: trailing = page_size - (btf_size % page_size) % page_size;
The intention is to calculate how many bytes of trailing zeroes to expect while accounting for the case where btf_size % page_size == 0.
Well, if it was: trailing = page_size - (btf_size % page_size); then it would be clear.
Extra '% page_size' makes it odd.
I could replace this with a check
end = btf_size + (page_size - 1) / page_size * page_size;
it's equivalent to end = btf_size; '(page_size - 1) / page_size' is always zero.
for (i = btf_size; i < end; i++) ...
Better?
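A precedence-safe way to spell the same check would be to round btf_size up to the next page boundary and walk the gap, e.g. (just a sketch, reusing the test's raw_data, btf_size and page_size):

	size_t end = (btf_size + page_size - 1) / page_size * page_size;

	/* every byte between the end of the blob and the next page boundary
	 * must read back as zero; the loop never runs when btf_size is
	 * already page aligned */
	for (size_t i = btf_size; i < end; i++) {
		if (((__u8 *)raw_data)[i] != 0) {
			PRINT_FAIL("tail of BTF is not zero at offset %zu\n", i);
			goto cleanup;
		}
	}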
In the meantime I've looked at allowing mmap of kmods. I'm not sure it's worth the effort:
- Allocations of btf->data in btf_parse_module() would have to use
vmalloc_user() so that allocations are page aligned and zeroed appropriately. This will be a bit more expensive on systems with large pages and / or many small kmod BTFs.
since we kvmemdup(BTF section) now anyway, making it vmalloc-ed isn't a big deal.
We could only allow mmap of BTF >= PAGE_SIZE, at additional complexity.
I wouldn't go this route. Too much special casing for user space. Unless you mean that 'if (btf_size < PAGE_SIZE) dont_vmalloc' will be the kernel-internal decision that is invisible to user space, and libbpf-like libraries would try to mmap first anyway and always fall back to reading?
- We need to hold a refcount on struct btf for each mmapped kernel
module, so that btf->data doesn't get freed. Taking the refcount can happen in the sysfs mmap handler, but dropping it is tricky. kernfs / sysfs doesn't allow using vm_ops->close (see kernfs_fop_mmap). It seems possible to use struct kernfs_ops->release(), but I don't understand at all how that deals with multiple mmaps of the same file in a single process. Also makes me wonder what happens when a process mmaps the kmod BTF, the module is unloaded and then the process attempts to access the mmap. My cursory understanding is that this would raise a fault, which isn't great at all.
that gets tricky indeed.
If nobody objects / has solutions I'll send a v3 of my original patch with reviews addressed but without being able to mmap kmods.
Makes sense to me. We can always follow up.
Add a basic test for the ability to mmap /sys/kernel/btf/vmlinux. Since libbpf doesn't have an API to parse BTF from memory we do some basic sanity checks ourselves.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
---
 tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 82 ++++++++++++++++++++++
 1 file changed, 82 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c
new file mode 100644
index 0000000000000000000000000000000000000000..5c8095bedb0517930aabdecc17ca7043f80f3692
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/* Copyright (c) 2025 Isovalent */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define BTF_MAGIC 0xeB9F
+
+static const char *btf_path = "/sys/kernel/btf/vmlinux";
+
+void test_btf_sysfs(void)
+{
+	struct stat st;
+	__u64 btf_size;
+	void *raw_data = NULL;
+	int fd = -1;
+	size_t trailing;
+	long page_size;
+	struct btf *btf = NULL;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	if (!ASSERT_GE(page_size, 0, "get_page_size"))
+		goto cleanup;
+
+	if (!ASSERT_OK(stat(btf_path, &st), "stat_btf"))
+		goto cleanup;
+
+	btf_size = st.st_size;
+	trailing = page_size - (btf_size % page_size) % page_size;
+
+	fd = open(btf_path, O_RDONLY);
+	if (!ASSERT_GE(fd, 0, "open_btf"))
+		goto cleanup;
+
+	raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable"))
+		goto cleanup;
+
+	raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0);
+	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared"))
+		goto cleanup;
+
+	raw_data = mmap(NULL, btf_size + trailing + 1, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size"))
+		goto cleanup;
+
+	raw_data = mmap(NULL, btf_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (!ASSERT_NEQ(raw_data, MAP_FAILED, "mmap_btf"))
+		goto cleanup;
+
+	if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1,
+		       "mprotect_writable"))
+		goto cleanup;
+
+	if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1,
+		       "mprotect_executable"))
+		goto cleanup;
+
+	/* Check padding is zeroed */
+	for (int i = 0; i < trailing; i++) {
+		if (((__u8 *)raw_data)[btf_size + i] != 0) {
+			PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i);
+			goto cleanup;
+		}
+	}
+
+	btf = btf__new(raw_data, btf_size);
+	if (!ASSERT_NEQ(btf, NULL, "parse_btf"))
+		goto cleanup;
+
+cleanup:
+	if (raw_data && raw_data != MAP_FAILED)
+		munmap(raw_data, btf_size);
+	if (btf)
+		btf__free(btf);
+	if (fd >= 0)
+		close(fd);
+}
Teach libbpf to use mmap when parsing vmlinux BTF from /sys. We don't apply this to fall-back paths on the regular file system because there is no way to ensure that modifications underlying the MAP_PRIVATE mapping are not visible to the process.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
---
 tools/lib/bpf/btf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 11 deletions(-)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b7513d4cce55b263310c341bc254df6364e829d9..7fec41a2dc617c9d388f9ab10d9850ef759c74d9 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -12,6 +12,7 @@
 #include <sys/utsname.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/mman.h>
 #include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/btf.h>
@@ -120,6 +121,9 @@ struct btf {
 	/* whether base_btf should be freed in btf_free for this instance */
 	bool owns_base;
 
+	/* whether raw_data is a (read-only) mmap */
+	bool raw_data_is_mmap;
+
 	/* BTF object FD, if loaded into kernel */
 	int fd;
 
@@ -951,6 +955,17 @@ static bool btf_is_modifiable(const struct btf *btf)
 	return (void *)btf->hdr != btf->raw_data;
 }
 
+static void btf_free_raw_data(struct btf *btf)
+{
+	if (btf->raw_data_is_mmap) {
+		munmap(btf->raw_data, btf->raw_size);
+		btf->raw_data_is_mmap = false;
+	} else {
+		free(btf->raw_data);
+	}
+	btf->raw_data = NULL;
+}
+
 void btf__free(struct btf *btf)
 {
 	if (IS_ERR_OR_NULL(btf))
@@ -970,7 +985,7 @@ void btf__free(struct btf *btf)
 		free(btf->types_data);
 		strset__free(btf->strs_set);
 	}
-	free(btf->raw_data);
+	btf_free_raw_data(btf);
 	free(btf->raw_data_swapped);
 	free(btf->type_offs);
 	if (btf->owns_base)
@@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf)
 	return libbpf_ptr(btf_new_empty(base_btf));
 }
 
-static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
+static struct btf *btf_new_no_copy(void *data, __u32 size, struct btf *base_btf)
 {
 	struct btf *btf;
 	int err;
@@ -1050,12 +1065,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
 		btf->start_str_off = base_btf->hdr->str_len;
 	}
 
-	btf->raw_data = malloc(size);
-	if (!btf->raw_data) {
-		err = -ENOMEM;
-		goto done;
-	}
-	memcpy(btf->raw_data, data, size);
+	btf->raw_data = data;
 	btf->raw_size = size;
 
 	btf->hdr = btf->raw_data;
@@ -1081,6 +1091,24 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
 	return btf;
 }
 
+static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
+{
+	struct btf *btf;
+	void *raw_data;
+
+	raw_data = malloc(size);
+	if (!raw_data)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(raw_data, data, size);
+
+	btf = btf_new_no_copy(raw_data, size, base_btf);
+	if (IS_ERR(btf))
+		free(raw_data);
+
+	return btf;
+}
+
 struct btf *btf__new(const void *data, __u32 size)
 {
 	return libbpf_ptr(btf_new(data, size, NULL));
@@ -1659,8 +1687,7 @@ struct btf *btf__load_from_kernel_by_id(__u32 id)
 static void btf_invalidate_raw_data(struct btf *btf)
 {
 	if (btf->raw_data) {
-		free(btf->raw_data);
-		btf->raw_data = NULL;
+		btf_free_raw_data(btf);
 	}
 	if (btf->raw_data_swapped) {
 		free(btf->raw_data_swapped);
@@ -5290,7 +5317,40 @@ struct btf *btf__load_vmlinux_btf(void)
 		pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n",
 			sysfs_btf_path);
 	} else {
-		btf = btf__parse(sysfs_btf_path, NULL);
+		struct stat st;
+		void *data = NULL;
+		int fd;
+
+		fd = open(sysfs_btf_path, O_RDONLY);
+		if (fd < 0) {
+			err = -errno;
+			pr_warn("failed to open kernel BTF at '%s': %s\n",
+				sysfs_btf_path, errstr(err));
+			return libbpf_err_ptr(err);
+		}
+
+		if (fstat(fd, &st) < 0) {
+			err = -errno;
+			pr_warn("failed to stat kernel BTF at '%s': %s\n",
+				sysfs_btf_path, errstr(err));
+			close(fd);
+			return libbpf_err_ptr(err);
+		}
+
+		data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+		close(fd);
+
+		if (data != MAP_FAILED) {
+			btf = libbpf_ptr(btf_new_no_copy(data, st.st_size, NULL));
+			if (!btf)
+				munmap(data, st.st_size);
+			else
+				btf->raw_data_is_mmap = true;
+		} else {
+			pr_debug("reading kernel BTF via file-based fallback\n");
+			btf = btf__parse(sysfs_btf_path, NULL);
+		}
+
 		if (!btf) {
 			err = -errno;
 			pr_warn("failed to read kernel BTF from '%s': %s\n",
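For completeness, callers don't change; a minimal consumer like the sketch below (not part of the patch) keeps using the existing API and simply avoids the extra copy on kernels that support mmap of the sysfs file:

#include <stdio.h>
#include <bpf/btf.h>

int main(void)
{
	struct btf *vmlinux = btf__load_vmlinux_btf();

	if (!vmlinux) {
		fprintf(stderr, "failed to load vmlinux BTF\n");
		return 1;
	}
	/* btf__type_cnt() counts type IDs, including the implicit void */
	printf("vmlinux BTF: %u type IDs\n", btf__type_cnt(vmlinux));
	btf__free(vmlinux);
	return 0;
}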