Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
Cc: stable@vger.kernel.org Cc: Jann Horn jannh@google.com Suggested-by: Andi Kleen ak@linux.intel.com Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib") Signed-off-by: Andrii Nakryiko andrii@kernel.org --- lib/buildid.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-)
diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..d21d86f6c19a 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -18,28 +18,29 @@ static int parse_build_id_buf(unsigned char *build_id, const void *note_start, Elf32_Word note_size) { + const char note_name[] = "GNU"; + const size_t note_name_sz = sizeof(note_name); Elf32_Word note_offs = 0, new_offs; + u32 name_sz, desc_sz; + const char *data;
while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+ name_sz = READ_ONCE(nhdr->n_namesz); + desc_sz = READ_ONCE(nhdr->n_descsz); if (nhdr->n_type == BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - !strcmp((char *)(nhdr + 1), "GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BUILD_ID_SIZE_MAX - nhdr->n_descsz); + name_sz == note_name_sz && + strcmp((char *)(nhdr + 1), note_name) == 0 && + desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { + data = note_start + note_offs + ALIGN(note_name_sz, 4); + memcpy(build_id, data, desc_sz); + memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) - *size = nhdr->n_descsz; + *size = desc_sz; return 0; } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + new_offs = note_offs + sizeof(Elf32_Nhdr) + ALIGN(name_sz, 4) + ALIGN(desc_sz, 4); if (new_offs <= note_offs) /* overflow */ break; note_offs = new_offs; @@ -71,7 +72,7 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; - int i; + __u32 i, phnum;
/* * FIXME @@ -80,9 +81,10 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) return -EINVAL;
phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); @@ -90,8 +92,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL; @@ -103,7 +105,7 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; - int i; + __u32 i, phnum;
/* * FIXME @@ -112,18 +114,19 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) return -EINVAL;
phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
- for (i = 0; i < ehdr->e_phnum; ++i) { + for (i = 0; i < phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL;
while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
name_sz = READ_ONCE(nhdr->n_namesz);
if (nhdr->n_type == BUILD_ID &&desc_sz = READ_ONCE(nhdr->n_descsz);
nhdr->n_namesz == sizeof("GNU") &&
!strcmp((char *)(nhdr + 1), "GNU") &&
nhdr->n_descsz > 0 &&
nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
memcpy(build_id,
note_start + note_offs +
ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
nhdr->n_descsz);
memset(build_id + nhdr->n_descsz, 0,
BUILD_ID_SIZE_MAX - nhdr->n_descsz);
name_sz == note_name_sz &&
strcmp((char *)(nhdr + 1), note_name) == 0 &&
Doesn't the strcmp need a boundary check to be inside note_size too?
Other it may read into the next page, which could be unmapped, causing a fault. Given it's unlikely that this happen, and the end has guard pages, but there are some users of set_memory_np.
You could just move the later checks earlier.
The rest looks good to me.
-Andi
On Tue, Jul 30, 2024 at 9:05 PM Andi Kleen ak@linux.intel.com wrote:
while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
name_sz = READ_ONCE(nhdr->n_namesz);
desc_sz = READ_ONCE(nhdr->n_descsz); if (nhdr->n_type == BUILD_ID &&
nhdr->n_namesz == sizeof("GNU") &&
!strcmp((char *)(nhdr + 1), "GNU") &&
nhdr->n_descsz > 0 &&
nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
memcpy(build_id,
note_start + note_offs +
ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
nhdr->n_descsz);
memset(build_id + nhdr->n_descsz, 0,
BUILD_ID_SIZE_MAX - nhdr->n_descsz);
name_sz == note_name_sz &&
strcmp((char *)(nhdr + 1), note_name) == 0 &&
Doesn't the strcmp need a boundary check to be inside note_size too?
Other it may read into the next page, which could be unmapped, causing a fault. Given it's unlikely that this happen, and the end has guard pages, but there are some users of set_memory_np.
You could just move the later checks earlier.
Yep, good catch! I'll move the overflow check and will add a note_size check to it, thanks!
The rest looks good to me.
-Andi
+Matthew and fsdevel list for pagecache question
On Tue, Jul 30, 2024 at 10:39 PM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
One thing that still seems dodgy to me with this patch applied is the call from build_id_parse() to find_get_page(), followed by reading the page contents. My understanding of the page cache (which might be incorrect) is that find_get_page() can return a page whose contents have not been initialized yet, and you're supposed to check for PageUptodate() or something like that before reading from it.
Maybe Matthew can check if I understood that right?
Also, it might be a good idea to liberally spray READ_ONCE() around all the remaining unannotated shared memory accesses in build_id_parse(), get_build_id_32(), get_build_id_64() and parse_build_id_buf().
Cc: stable@vger.kernel.org Cc: Jann Horn jannh@google.com Suggested-by: Andi Kleen ak@linux.intel.com Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib") Signed-off-by: Andrii Nakryiko andrii@kernel.org
lib/buildid.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-)
diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..d21d86f6c19a 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -18,28 +18,29 @@ static int parse_build_id_buf(unsigned char *build_id, const void *note_start, Elf32_Word note_size) {
const char note_name[] = "GNU";
const size_t note_name_sz = sizeof(note_name); Elf32_Word note_offs = 0, new_offs;
u32 name_sz, desc_sz;
const char *data; while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
name_sz = READ_ONCE(nhdr->n_namesz);
desc_sz = READ_ONCE(nhdr->n_descsz); if (nhdr->n_type == BUILD_ID &&
nhdr->n_namesz == sizeof("GNU") &&
!strcmp((char *)(nhdr + 1), "GNU") &&
nhdr->n_descsz > 0 &&
nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
memcpy(build_id,
note_start + note_offs +
ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
nhdr->n_descsz);
memset(build_id + nhdr->n_descsz, 0,
BUILD_ID_SIZE_MAX - nhdr->n_descsz);
name_sz == note_name_sz &&
strcmp((char *)(nhdr + 1), note_name) == 0 &&
desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) {
data = note_start + note_offs + ALIGN(note_name_sz, 4);
I don't think we have any guarantee here that this addition won't result in an OOB pointer?
memcpy(build_id, data, desc_sz);
I think this can access OOB data (because "data" can already be OOB and because "desc_sz" hasn't been checked against the amount of remaining space in the page).
memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size)
*size = nhdr->n_descsz;
*size = desc_sz; return 0; }
new_offs = note_offs + sizeof(Elf32_Nhdr) +
ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
new_offs = note_offs + sizeof(Elf32_Nhdr) + ALIGN(name_sz, 4) + ALIGN(desc_sz, 4); if (new_offs <= note_offs) /* overflow */ break;
You check whether "new_offs" has wrapped here, but then on the next loop iteration, you check for "note_offs + sizeof(Elf32_Nhdr) < note_size". So if new_offs is 0xffffffff at this point, then I think the overflow check here will be passed, the loop condition will be true on 32-bit kernels (on 64-bit kernels it won't be because the addition happens on 64-bit numbers thanks to sizeof()), and "nhdr" will point in front of the note?
note_offs = new_offs;
On Wed, Aug 7, 2024 at 8:12 AM Jann Horn jannh@google.com wrote:
+Matthew and fsdevel list for pagecache question
On Tue, Jul 30, 2024 at 10:39 PM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
One thing that still seems dodgy to me with this patch applied is the call from build_id_parse() to find_get_page(), followed by reading the page contents. My understanding of the page cache (which might be incorrect) is that find_get_page() can return a page whose contents have not been initialized yet, and you're supposed to check for PageUptodate() or something like that before reading from it.
Maybe Matthew can check if I understood that right?
Also, it might be a good idea to liberally spray READ_ONCE() around all the remaining unannotated shared memory accesses in build_id_parse(), get_build_id_32(), get_build_id_64() and parse_build_id_buf().
Andi was against that, so I kept READ_ONCE() only where strictly necessary, AFAICT.
Cc: stable@vger.kernel.org Cc: Jann Horn jannh@google.com Suggested-by: Andi Kleen ak@linux.intel.com Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib") Signed-off-by: Andrii Nakryiko andrii@kernel.org
lib/buildid.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-)
diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..d21d86f6c19a 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -18,28 +18,29 @@ static int parse_build_id_buf(unsigned char *build_id, const void *note_start, Elf32_Word note_size) {
const char note_name[] = "GNU";
const size_t note_name_sz = sizeof(note_name); Elf32_Word note_offs = 0, new_offs;
u32 name_sz, desc_sz;
const char *data; while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
name_sz = READ_ONCE(nhdr->n_namesz);
desc_sz = READ_ONCE(nhdr->n_descsz); if (nhdr->n_type == BUILD_ID &&
nhdr->n_namesz == sizeof("GNU") &&
!strcmp((char *)(nhdr + 1), "GNU") &&
nhdr->n_descsz > 0 &&
nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
memcpy(build_id,
note_start + note_offs +
ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
nhdr->n_descsz);
memset(build_id + nhdr->n_descsz, 0,
BUILD_ID_SIZE_MAX - nhdr->n_descsz);
name_sz == note_name_sz &&
strcmp((char *)(nhdr + 1), note_name) == 0 &&
desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) {
data = note_start + note_offs + ALIGN(note_name_sz, 4);
I don't think we have any guarantee here that this addition won't result in an OOB pointer?
memcpy(build_id, data, desc_sz);
I think this can access OOB data (because "data" can already be OOB and because "desc_sz" hasn't been checked against the amount of remaining space in the page).
Andi already pointed this out and I fixed it locally, thanks.
memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size)
*size = nhdr->n_descsz;
*size = desc_sz; return 0; }
new_offs = note_offs + sizeof(Elf32_Nhdr) +
ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
new_offs = note_offs + sizeof(Elf32_Nhdr) + ALIGN(name_sz, 4) + ALIGN(desc_sz, 4); if (new_offs <= note_offs) /* overflow */ break;
You check whether "new_offs" has wrapped here, but then on the next loop iteration, you check for "note_offs + sizeof(Elf32_Nhdr) < note_size". So if new_offs is 0xffffffff at this point, then I think the overflow check here will be passed, the loop condition will be true on 32-bit kernels (on 64-bit kernels it won't be because the addition happens on 64-bit numbers thanks to sizeof()), and "nhdr" will point in front of the note?
Correct, and so I moved this new_offs calculation and overflow check to the beginning of the loop, which I think should capture this issue.
For the while() condition itself I have:
if (check_add_overflow(note_offs, note_size, ¬e_end)) return -EINVAL;
while (note_offs < note_end - sizeof(Elf32_Nhdr) - note_name_sz) { ... }
I'll try to post an updated version soon-ish, have been waiting for mm folks feedback before posting a new version.
note_offs = new_offs;
linux-stable-mirror@lists.linaro.org