From: Po-Hsu Lin <po-hsu.lin(a)canonical.com>
[ Upstream commit 8c03557c3f25271e62e39154af66ebdd1b59c9ca ]
The run_afpackettests will be marked as passed regardless of the return
values of the sub-tests in the script:
--------------------
running psock_tpacket test
--------------------
[FAIL]
selftests: run_afpackettests [PASS]
Fix this by changing the return value for each test.
Signed-off-by: Po-Hsu Lin <po-hsu.lin(a)canonical.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/net/run_afpackettests | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tools/testing/selftests/net/run_afpackettests b/tools/testing/selftests/net/run_afpackettests
index 2dc95fda7ef7..ea5938ec009a 100755
--- a/tools/testing/selftests/net/run_afpackettests
+++ b/tools/testing/selftests/net/run_afpackettests
@@ -6,12 +6,14 @@ if [ $(id -u) != 0 ]; then
exit 0
fi
+ret=0
echo "--------------------"
echo "running psock_fanout test"
echo "--------------------"
./in_netns.sh ./psock_fanout
if [ $? -ne 0 ]; then
echo "[FAIL]"
+ ret=1
else
echo "[PASS]"
fi
@@ -22,6 +24,7 @@ echo "--------------------"
./in_netns.sh ./psock_tpacket
if [ $? -ne 0 ]; then
echo "[FAIL]"
+ ret=1
else
echo "[PASS]"
fi
@@ -32,6 +35,8 @@ echo "--------------------"
./in_netns.sh ./txring_overwrite
if [ $? -ne 0 ]; then
echo "[FAIL]"
+ ret=1
else
echo "[PASS]"
fi
+exit $ret
--
2.20.1
From: Po-Hsu Lin <po-hsu.lin(a)canonical.com>
[ Upstream commit 30c04d796b693e22405c38e9b78e9a364e4c77e6 ]
The run_netsocktests will be marked as passed regardless of the actual test
result from ./socket:
selftests: net: run_netsocktests
========================================
--------------------
running socket test
--------------------
[FAIL]
ok 1..6 selftests: net: run_netsocktests [PASS]
This is because the test script itself was executed successfully.
Fix this by exiting with 1 when the test fails.
Signed-off-by: Po-Hsu Lin <po-hsu.lin(a)canonical.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/net/run_netsocktests | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests
index b093f39c298c..14e41faf2c57 100755
--- a/tools/testing/selftests/net/run_netsocktests
+++ b/tools/testing/selftests/net/run_netsocktests
@@ -7,7 +7,7 @@ echo "--------------------"
./socket
if [ $? -ne 0 ]; then
echo "[FAIL]"
+ exit 1
else
echo "[PASS]"
fi
-
--
2.20.1
From: David Ahern <dsahern(a)gmail.com>
[ Upstream commit a5f622984a623df9a84cf43f6b098d8dd76fbe05 ]
A couple of tests are verifying that a route has been removed. The helper
expects the prefix as the first part of the expected output. When
checking that a route has been deleted, the prefix is empty, leading
to an invalid ip command:
$ ip ro ls match
Command line is not complete. Try option "help"
Fix by moving the comparison of expected output and output to a new
function that is used by both check_route and check_route6. Use the
new helper for the 2 checks on route removal.
Also, remove the reset of 'set -x' in route_setup which overrides the
user managed setting.
Fixes: d69faad76584c ("selftests: fib_tests: Add prefix route tests with metric")
Signed-off-by: David Ahern <dsahern(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/net/fib_tests.sh | 94 ++++++++++--------------
1 file changed, 40 insertions(+), 54 deletions(-)
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 1080ff55a788..0d2a5f4f1e63 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -605,6 +605,39 @@ run_cmd()
return $rc
}
+check_expected()
+{
+ local out="$1"
+ local expected="$2"
+ local rc=0
+
+ [ "${out}" = "${expected}" ] && return 0
+
+ if [ -z "${out}" ]; then
+ if [ "$VERBOSE" = "1" ]; then
+ printf "\nNo route entry found\n"
+ printf "Expected:\n"
+ printf " ${expected}\n"
+ fi
+ return 1
+ fi
+
+ # tricky way to convert output to 1-line without ip's
+ # messy '\'; this drops all extra white space
+ out=$(echo ${out})
+ if [ "${out}" != "${expected}" ]; then
+ rc=1
+ if [ "${VERBOSE}" = "1" ]; then
+ printf " Unexpected route entry. Have:\n"
+ printf " ${out}\n"
+ printf " Expected:\n"
+ printf " ${expected}\n\n"
+ fi
+ fi
+
+ return $rc
+}
+
# add route for a prefix, flushing any existing routes first
# expected to be the first step of a test
add_route6()
@@ -652,31 +685,7 @@ check_route6()
pfx=$1
out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
- [ "${out}" = "${expected}" ] && return 0
-
- if [ -z "${out}" ]; then
- if [ "$VERBOSE" = "1" ]; then
- printf "\nNo route entry found\n"
- printf "Expected:\n"
- printf " ${expected}\n"
- fi
- return 1
- fi
-
- # tricky way to convert output to 1-line without ip's
- # messy '\'; this drops all extra white space
- out=$(echo ${out})
- if [ "${out}" != "${expected}" ]; then
- rc=1
- if [ "${VERBOSE}" = "1" ]; then
- printf " Unexpected route entry. Have:\n"
- printf " ${out}\n"
- printf " Expected:\n"
- printf " ${expected}\n\n"
- fi
- fi
-
- return $rc
+ check_expected "${out}" "${expected}"
}
route_cleanup()
@@ -725,7 +734,7 @@ route_setup()
ip -netns ns2 addr add 172.16.103.2/24 dev veth4
ip -netns ns2 addr add 172.16.104.1/24 dev dummy1
- set +ex
+ set +e
}
# assumption is that basic add of a single path route works
@@ -960,7 +969,8 @@ ipv6_addr_metric_test()
run_cmd "$IP li set dev dummy2 down"
rc=$?
if [ $rc -eq 0 ]; then
- check_route6 ""
+ out=$($IP -6 ro ls match 2001:db8:104::/64)
+ check_expected "${out}" ""
rc=$?
fi
log_test $rc 0 "Prefix route removed on link down"
@@ -1091,38 +1101,13 @@ check_route()
local pfx
local expected="$1"
local out
- local rc=0
set -- $expected
pfx=$1
[ "${pfx}" = "unreachable" ] && pfx=$2
out=$($IP ro ls match ${pfx})
- [ "${out}" = "${expected}" ] && return 0
-
- if [ -z "${out}" ]; then
- if [ "$VERBOSE" = "1" ]; then
- printf "\nNo route entry found\n"
- printf "Expected:\n"
- printf " ${expected}\n"
- fi
- return 1
- fi
-
- # tricky way to convert output to 1-line without ip's
- # messy '\'; this drops all extra white space
- out=$(echo ${out})
- if [ "${out}" != "${expected}" ]; then
- rc=1
- if [ "${VERBOSE}" = "1" ]; then
- printf " Unexpected route entry. Have:\n"
- printf " ${out}\n"
- printf " Expected:\n"
- printf " ${expected}\n\n"
- fi
- fi
-
- return $rc
+ check_expected "${out}" "${expected}"
}
# assumption is that basic add of a single path route works
@@ -1387,7 +1372,8 @@ ipv4_addr_metric_test()
run_cmd "$IP li set dev dummy2 down"
rc=$?
if [ $rc -eq 0 ]; then
- check_route ""
+ out=$($IP ro ls match 172.16.104.0/24)
+ check_expected "${out}" ""
rc=$?
fi
log_test $rc 0 "Prefix route removed on link down"
--
2.20.1
Hi Linus,
Please pull the following Kselftest update for Linux 5.2-rc1
This Kselftest update for Linux 5.2-rc1 consists of
- fixes to seccomp test, and kselftest framework
- cleanups to remove duplicate header defines
- fixes to efivarfs "make clean" target
- cgroup cleanup path
- Moving the IMA kexec_load selftest to selftests/kexec, work from
  Mimi Zohar and Petr Vorel
- Addition of a framework to kselftest for writing kernel test modules,
  from Tobin C. Harding
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 15ade5d2e7775667cf191cf2f94327a4889f8b9d:
Linux 5.1-rc4 (2019-04-07 14:09:59 -1000)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
tags/linux-kselftest-5.2-rc1
for you to fetch changes up to d917fb876f6eaeeea8a2b620d2a266ce26372f4d:
selftests: build and run gpio when output directory is the src dir
(2019-04-22 17:02:26 -0600)
----------------------------------------------------------------
linux-kselftest-5.2-rc1
This Kselftest update for Linux 5.2-rc1 consists of
- fixes to seccomp test, and kselftest framework
- cleanups to remove duplicate header defines
- fixes to efivarfs "make clean" target
- cgroup cleanup path
- Moving the IMA kexec_load selftest to selftests/kexec, work from
  Mimi Zohar and Petr Vorel
- Addition of a framework to kselftest for writing kernel test modules,
  from Tobin C. Harding
----------------------------------------------------------------
Kees Cook (3):
selftests/seccomp: Handle namespace failures gracefully
selftests/harness: Add 30 second timeout per test
selftests/ipc: Fix msgque compiler warnings
Mathieu Desnoyers (1):
rseq/selftests: Adapt number of threads to the number of detected
cpus
Mimi Zohar (9):
selftests/kexec: move the IMA kexec_load selftest to selftests/kexec
selftests/kexec: cleanup the kexec selftest
selftests/kexec: define a set of common functions
selftests/kexec: define common logging functions
selftests/kexec: define "require_root_privileges"
selftests/kexec: kexec_file_load syscall test
selftests/kexec: check kexec_load and kexec_file_load are enabled
selftests/kexec: make kexec_load test independent of IMA being
enabled
selftests/kexec: update get_secureboot_mode
Petr Vorel (1):
selftests/kexec: Add missing '=y' to config options
Po-Hsu Lin (1):
selftests/efivarfs: clean up test files from test_create*()
Roman Gushchin (1):
selftests: cgroup: fix cleanup path in test_memcg_subtree_control()
Sabyasachi Gupta (4):
selftest/x86/mpx-dig.c: Remove duplicate header
selftest/timers: Remove duplicate header
selftest/rseq: Remove duplicate header
selftest/gpio: Remove duplicate header
Shuah Khan (2):
selftests: fix headers_install circular dependency
selftests: build and run gpio when output directory is the src dir
Tobin C. Harding (6):
lib/test_printf: Add empty module_exit function
kselftest: Add test runner creation script
kselftest: Add test module framework header
lib: Use new kselftest header
lib/string: Add strscpy_pad() function
lib: Add test module for strscpy_pad
ZhangXiaoxu (1):
selftests: efivarfs: remove the test_create_read file if it was exist
Documentation/dev-tools/kselftest.rst | 94 ++++++++-
include/linux/string.h | 4 +
lib/Kconfig.debug | 3 +
lib/Makefile | 1 +
lib/string.c | 47 ++++-
lib/test_bitmap.c | 20 +-
lib/test_printf.c | 17 +-
lib/test_strscpy.c | 150 ++++++++++++++
tools/testing/selftests/Makefile | 63 +++++-
tools/testing/selftests/cgroup/test_memcontrol.c | 38 ++--
tools/testing/selftests/efivarfs/efivarfs.sh | 28 ++-
tools/testing/selftests/gpio/gpio-mockup-chardev.c | 1 -
tools/testing/selftests/ima/config | 4 -
tools/testing/selftests/ima/test_kexec_load.sh | 54 -----
tools/testing/selftests/ipc/msgque.c | 11 +-
tools/testing/selftests/{ima => kexec}/Makefile | 5 +-
tools/testing/selftests/kexec/config | 3 +
tools/testing/selftests/kexec/kexec_common_lib.sh | 220 +++++++++++++++++++++
.../selftests/kexec/test_kexec_file_load.sh | 208 +++++++++++++++
tools/testing/selftests/kexec/test_kexec_load.sh | 47 +++++
tools/testing/selftests/kselftest_harness.h | 2 +
tools/testing/selftests/kselftest_module.h | 48 +++++
tools/testing/selftests/kselftest_module.sh | 84 ++++++++
tools/testing/selftests/lib.mk | 38 +++-
tools/testing/selftests/lib/Makefile | 2 +-
tools/testing/selftests/lib/bitmap.sh | 18 +-
tools/testing/selftests/lib/config | 1 +
tools/testing/selftests/lib/prime_numbers.sh | 17 +-
tools/testing/selftests/lib/printf.sh | 19 +-
tools/testing/selftests/lib/strscpy.sh | 3 +
tools/testing/selftests/rseq/rseq.h | 1 -
tools/testing/selftests/rseq/run_param_test.sh | 7 +-
tools/testing/selftests/seccomp/seccomp_bpf.c | 43 ++--
tools/testing/selftests/timers/skew_consistency.c | 1 -
tools/testing/selftests/x86/mpx-dig.c | 2 -
35 files changed, 1081 insertions(+), 223 deletions(-)
create mode 100644 lib/test_strscpy.c
delete mode 100644 tools/testing/selftests/ima/config
delete mode 100755 tools/testing/selftests/ima/test_kexec_load.sh
rename tools/testing/selftests/{ima => kexec}/Makefile (59%)
create mode 100644 tools/testing/selftests/kexec/config
create mode 100755 tools/testing/selftests/kexec/kexec_common_lib.sh
create mode 100755 tools/testing/selftests/kexec/test_kexec_file_load.sh
create mode 100755 tools/testing/selftests/kexec/test_kexec_load.sh
create mode 100644 tools/testing/selftests/kselftest_module.h
create mode 100755 tools/testing/selftests/kselftest_module.sh
create mode 100755 tools/testing/selftests/lib/strscpy.sh
----------------------------------------------------------------
=== Overview
arm64 has a feature called Top Byte Ignore, which allows embedding pointer
tags into the top byte of each pointer. Userspace programs (such as
HWASan, a memory debugging tool [1]) might use this feature and pass
tagged user pointers to the kernel through syscalls or other interfaces.
Right now the kernel is already able to handle user faults with tagged
pointers, due to these patches:
1. 81cddd65 ("arm64: traps: fix userspace cache maintenance emulation on a
tagged pointer")
2. 7dcd9dd8 ("arm64: hw_breakpoint: fix watchpoint matching for tagged
pointers")
3. 276e9327 ("arm64: entry: improve data abort handling of tagged
pointers")
This patchset extends tagged pointer support to syscall arguments.
As per the proposed ABI change [3], tagged pointers are only allowed to be
passed to syscalls when they point to memory ranges obtained by anonymous
mmap() or sbrk() (see the patchset [3] for more details).
For non-memory syscalls this is done by untagging user pointers when the
kernel performs pointer checking to find out whether the pointer comes
from userspace (most notably in access_ok). The untagging is done only
when the pointer is being checked; the tag is preserved as the pointer
makes its way through the kernel and stays tagged when the kernel
dereferences the pointer while performing user memory accesses.
Memory syscalls (mmap, mprotect, etc.) don't do user memory accesses but
rather deal with memory ranges, and untagged pointers are better suited to
describe memory ranges internally. Thus for memory syscalls we untag
pointers completely when they enter the kernel.
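For readers skimming the series, the core untagging helper is tiny. A rough
sketch of its shape (illustrative only; the exact definitions live in the
patches touching arch/arm64/include/asm/uaccess.h and include/linux/mm.h):

	#include <linux/bitops.h>	/* sign_extend64() */

	/* Generic fallback: architectures without pointer tags do nothing. */
	#ifndef untagged_addr
	#define untagged_addr(addr)	(addr)
	#endif

	/*
	 * arm64 with Top Byte Ignore: bits 63:56 may carry a tag. Sign-extending
	 * from bit 55 strips the tag from userspace addresses while leaving
	 * kernel addresses (which have bit 55 set) unchanged, roughly:
	 *
	 *	#define untagged_addr(addr)	sign_extend64(addr, 55)
	 */

Callers such as access_ok() then apply the helper to the address before the
range check, so tagged userspace pointers pass the check.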
=== Other approaches
One of the alternative approaches to untagging that was considered is to
completely strip the pointer tag as the pointer enters the kernel with
some kind of a syscall wrapper, but that won't work with the countless
number of different ioctl calls. With this approach we would need a custom
wrapper for each ioctl variation, which doesn't seem practical.
An alternative approach to untagging pointers in memory syscall prologues
is to instead allow tagged pointers to be passed to find_vma() (and other
vma-related functions) and untag them there. Unfortunately, a lot of
find_vma() callers then compare or subtract the returned vma start and end
fields against the pointer that was being searched for. Thus this approach
would still require changing all find_vma() callers.
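To illustrate the problem, here is a hypothetical caller pattern (not taken
from any specific site): even if find_vma() untagged the address internally,
the caller's own comparison would still mix a tagged address with untagged
vma bounds:

	#include <linux/errno.h>
	#include <linux/mm.h>

	/* Hypothetical example for illustration only. */
	static int check_user_range(struct mm_struct *mm, unsigned long addr,
				    unsigned long len)
	{
		/* addr may still carry a tag here */
		struct vm_area_struct *vma = find_vma(mm, addr);

		/*
		 * Comparing the tagged addr against the untagged vm_start/vm_end
		 * is unreliable, so untagging inside find_vma() alone would not
		 * be enough -- the caller needs fixing anyway.
		 */
		if (!vma || addr < vma->vm_start || addr + len > vma->vm_end)
			return -EFAULT;

		return 0;
	}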
=== Testing
The following testing approaches have been taken to find potential issues
with user pointer untagging:
1. Static testing (with sparse [2] and separately with a custom static
analyzer based on Clang) to track casts of __user pointers to integer
types to find places where untagging needs to be done.
2. Static testing with grep to find parts of the kernel that call
find_vma() (and other similar functions) or directly compare against
vm_start/vm_end fields of vma.
3. Static testing with grep to find parts of the kernel that compare
user pointers with TASK_SIZE or other similar consts and macros.
4. Dynamic testing: adding BUG_ON(has_tag(addr)) to find_vma() and running
a modified syzkaller version that passes tagged pointers to the kernel.
Based on the results of the testing, the required patches have been added
to the patchset.
=== Notes
This patchset is meant to be merged together with "arm64 relaxed ABI" [3].
This patchset is a prerequisite for ARM's memory tagging hardware feature
support [4].
This patchset has been merged into the Pixel 2 & 3 kernel trees and is
now being used to enable testing of Pixel phones with HWASan.
Thanks!
[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
[2] https://github.com/lucvoo/sparse-dev/commit/5f960cb10f56ec2017c128ef9d16060…
[3] https://lkml.org/lkml/2019/3/18/819
[4] https://community.arm.com/processors/b/blog/posts/arm-a-profile-architectur…
Changes in v14:
- Moved untagging for most memory syscalls to an arm64 specific
implementation, instead of doing that in the common code.
- Dropped "net, arm64: untag user pointers in tcp_zerocopy_receive", since
the provided user pointers don't come from an anonymous map and thus are
not covered by this ABI relaxation.
- Dropped "kernel, arm64: untag user pointers in prctl_set_mm*".
- Moved untagging from __check_mem_type() to tee_shm_register().
- Updated untagging for the amdgpu and radeon drivers to cover the MMU
notifier, as suggested by Felix.
- Since this ABI relaxation doesn't actually allow tagged instruction
pointers, dropped the following patches:
- Dropped "tracing, arm64: untag user pointers in seq_print_user_ip".
- Dropped "uprobes, arm64: untag user pointers in find_active_uprobe".
- Dropped "bpf, arm64: untag user pointers in stack_map_get_build_id_offset".
- Rebased onto 5.1-rc7 (37624b58).
Changes in v13:
- Simplified untagging in tcp_zerocopy_receive().
- Looked at find_vma() callers in drivers/, which allowed identifying a
few other places where untagging is needed.
- Added patch "mm, arm64: untag user pointers in get_vaddr_frames".
- Added patch "drm/amdgpu, arm64: untag user pointers in
amdgpu_ttm_tt_get_user_pages".
- Added patch "drm/radeon, arm64: untag user pointers in
radeon_ttm_tt_pin_userptr".
- Added patch "IB/mlx4, arm64: untag user pointers in mlx4_get_umem_mr".
- Added patch "media/v4l2-core, arm64: untag user pointers in
videobuf_dma_contig_user_get".
- Added patch "tee/optee, arm64: untag user pointers in check_mem_type".
- Added patch "vfio/type1, arm64: untag user pointers".
Changes in v12:
- Changed untagging in tcp_zerocopy_receive() to also untag zc->address.
- Fixed untagging in prctl_set_mm* to only untag pointers for vma lookups
and validity checks, but leave them as is for actual user space accesses.
- Updated the link to the v2 of the "arm64 relaxed ABI" patchset [3].
- Dropped the documentation patch, as the "arm64 relaxed ABI" patchset [3]
handles that.
Changes in v11:
- Added "uprobes, arm64: untag user pointers in find_active_uprobe" patch.
- Added "bpf, arm64: untag user pointers in stack_map_get_build_id_offset"
patch.
- Fixed "tracing, arm64: untag user pointers in seq_print_user_ip" to
correctly perform subtraction with a tagged addr.
- Moved untagged_addr() from SYSCALL_DEFINE3(mprotect) and
SYSCALL_DEFINE4(pkey_mprotect) to do_mprotect_pkey().
- Moved untagged_addr() definition for other arches from
include/linux/memory.h to include/linux/mm.h.
- Changed untagging in strn*_user() to perform userspace accesses through
tagged pointers.
- Updated the documentation to mention that passing tagged pointers to
memory syscalls is allowed.
- Updated the test to use malloc'ed memory instead of stack memory.
Changes in v10:
- Added "mm, arm64: untag user pointers passed to memory syscalls" back.
- New patch "fs, arm64: untag user pointers in fs/userfaultfd.c".
- New patch "net, arm64: untag user pointers in tcp_zerocopy_receive".
- New patch "kernel, arm64: untag user pointers in prctl_set_mm*".
- New patch "tracing, arm64: untag user pointers in seq_print_user_ip".
Changes in v9:
- Rebased onto 4.20-rc6.
- Used u64 instead of __u64 in type casts in the untagged_addr macro for
arm64.
- Added braces around (addr) in the untagged_addr macro for other arches.
Changes in v8:
- Rebased onto 65102238 (4.20-rc1).
- Added a note to the cover letter on why syscall wrappers/shims that untag
user pointers won't work.
- Added a note to the cover letter that this patchset has been merged into
the Pixel 2 kernel tree.
- Documentation fixes, in particular added a list of syscalls that don't
support tagged user pointers.
Changes in v7:
- Rebased onto 17b57b18 (4.19-rc6).
- Dropped the "arm64: untag user address in __do_user_fault" patch, since
the existing patches already handle user faults properly.
- Dropped the "usb, arm64: untag user addresses in devio" patch, since the
passed pointer must come from a vma and therefore be untagged.
- Dropped the "arm64: annotate user pointers casts detected by sparse"
patch (see the discussion to the replies of the v6 of this patchset).
- Added more context to the cover letter.
- Updated Documentation/arm64/tagged-pointers.txt.
Changes in v6:
- Added annotations for user pointer casts found by sparse.
- Rebased onto 050cdc6c (4.19-rc1+).
Changes in v5:
- Added 3 new patches that add untagging to places found with static
analysis.
- Rebased onto 44c929e1 (4.18-rc8).
Changes in v4:
- Added a selftest for checking that passing tagged pointers to the
kernel succeeds.
- Rebased onto 81e97f013 (4.18-rc1+).
Changes in v3:
- Rebased onto e5c51f30 (4.17-rc6+).
- Added linux-arch@ to the list of recipients.
Changes in v2:
- Rebased onto 2d618bdf (4.17-rc3+).
- Removed excessive untagging in gup.c.
- Removed untagging pointers returned from __uaccess_mask_ptr.
Changes in v1:
- Rebased onto 4.17-rc1.
Changes in RFC v2:
- Added "#ifndef untagged_addr..." fallback in linux/uaccess.h instead of
defining it for each arch individually.
- Updated Documentation/arm64/tagged-pointers.txt.
- Dropped "mm, arm64: untag user addresses in memory syscalls".
- Rebased onto 3eb2ce82 (4.16-rc7).
Signed-off-by: Andrey Konovalov <andreyknvl(a)google.com>
Andrey Konovalov (17):
uaccess: add untagged_addr definition for other arches
arm64: untag user pointers in access_ok and __uaccess_mask_ptr
lib, arm64: untag user pointers in strn*_user
mm: add ksys_ wrappers to memory syscalls
arms64: untag user pointers passed to memory syscalls
mm: untag user pointers in do_pages_move
mm, arm64: untag user pointers in mm/gup.c
mm, arm64: untag user pointers in get_vaddr_frames
fs, arm64: untag user pointers in copy_mount_options
fs, arm64: untag user pointers in fs/userfaultfd.c
drm/amdgpu, arm64: untag user pointers
drm/radeon, arm64: untag user pointers
IB/mlx4, arm64: untag user pointers in mlx4_get_umem_mr
media/v4l2-core, arm64: untag user pointers in
videobuf_dma_contig_user_get
tee, arm64: untag user pointers in tee_shm_register
vfio/type1, arm64: untag user pointers in vaddr_get_pfn
selftests, arm64: add a selftest for passing tagged pointers to kernel
arch/arm64/include/asm/uaccess.h | 10 +-
arch/arm64/kernel/sys.c | 128 ++++++++++++++++-
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +-
drivers/gpu/drm/radeon/radeon_gem.c | 2 +
drivers/gpu/drm/radeon/radeon_ttm.c | 2 +-
drivers/infiniband/hw/mlx4/mr.c | 7 +-
drivers/media/v4l2-core/videobuf-dma-contig.c | 9 +-
drivers/tee/tee_shm.c | 1 +
drivers/vfio/vfio_iommu_type1.c | 2 +
fs/namespace.c | 2 +-
fs/userfaultfd.c | 5 +
include/linux/mm.h | 4 +
include/linux/syscalls.h | 22 +++
ipc/shm.c | 7 +-
lib/strncpy_from_user.c | 3 +-
lib/strnlen_user.c | 3 +-
mm/frame_vector.c | 2 +
mm/gup.c | 4 +
mm/madvise.c | 129 +++++++++---------
mm/mempolicy.c | 21 ++-
mm/migrate.c | 1 +
mm/mincore.c | 57 ++++----
mm/mlock.c | 20 ++-
mm/mmap.c | 30 +++-
mm/mprotect.c | 6 +-
mm/mremap.c | 27 ++--
mm/msync.c | 35 +++--
tools/testing/selftests/arm64/.gitignore | 1 +
tools/testing/selftests/arm64/Makefile | 11 ++
.../testing/selftests/arm64/run_tags_test.sh | 12 ++
tools/testing/selftests/arm64/tags_test.c | 21 +++
33 files changed, 431 insertions(+), 159 deletions(-)
create mode 100644 tools/testing/selftests/arm64/.gitignore
create mode 100644 tools/testing/selftests/arm64/Makefile
create mode 100755 tools/testing/selftests/arm64/run_tags_test.sh
create mode 100644 tools/testing/selftests/arm64/tags_test.c
--
2.21.0.593.g511ec345e18-goog
On Fri, May 3, 2019 at 10:08 PM Linus Torvalds
<torvalds(a)linux-foundation.org> wrote:
>
> I'll look at it tomorrow, but I think this actually makes unnecessary changes.
>
> In particular, I think we could keep the existing entry code almost unchanged with this whole approach.
So here's what I *think* should work. Note that I also removed your
test-case code, because it really didn't have a chance in hell of
working. Doing that
int3_emulate_call(regs, (unsigned long)&int3_magic);
inside of int3_exception_notify() could not possibly be valid, since
int3_emulate_call() returns the new pt_regs that need to be used, and
throwing it away is clearly wrong.
So you can't use a register_die_notifier() to try to intercept the
'int3' error and then do it manually, it needs to be done by the
ftrace_int3_handler() code that actually returns the new regs, and
where do_kernel_int3() will then return it to the low-level handler.
End result: I haven't actually tested this code, but I've looked
through the patch something like ten times without finding any new
errors.
I've also tried *very* hard to make the patch minimal, with the
exception of the comments, which I tried to make extensive for any of
the subtle cases.
But without testing, it's probably still buggy.
I have to say, I finally like the end result here. Maybe it's because
I got to make my mark and pee in the snow, but I will say that
(a) the actual entry code modifications really are minimal now
(b) the instruction emulation really is very simple and straightforward
(c) yes, we play some stack tricks (and yes, we play them differently
on x86-64 and x86-32), but the tricks are again at least
straightforward, and we never really change the layout of any stack.
So on the whole, I think this is about as good as it gets. Did I get
all the details actually right, and it _works_? I guess we'll see.
Linus
Given that the entry_*.S changes for this functionality are somewhat
tricky, make sure the paths are tested every boot, instead of on the
rare occasion when we trip an INT3 while rewriting text.
Getting the INT3 frame setup even slightly wrong will make this come
unstuck something spectacular.
Requested-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
---
arch/x86/kernel/alternative.c | 81 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 77 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4db9c0d29bc1..2b853c2ab894 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,83 @@ extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+static void __init int3_magic(unsigned int *ptr)
+{
+ *ptr = 1;
+}
+
+extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
+
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
+
+ if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
+ return NOTIFY_DONE;
+
+ int3_emulate_call(regs, (unsigned long)&int3_magic);
+ return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_magic(&val); but really complicated :-)
+ *
+ * Stick the address of the INT3 instruction into int3_selftest_ip,
+ * then trigger the INT3, padded with NOPs to match a CALL instruction
+ * length.
+ */
+ asm volatile ("1: int3; nop; nop; nop; nop\n\t"
+ ".pushsection .init.data,\"aw\"\n\t"
+ ".align " __ASM_SEL(4, 8) "\n\t"
+ ".type int3_selftest_ip, @object\n\t"
+ ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
+ "int3_selftest_ip:\n\t"
+ __ASM_SEL(.long, .quad) " 1b\n\t"
+ ".popsection\n\t"
+ : : __ASM_SEL_RAW(a, D) (&val) : "memory");
+
+ BUG_ON(val != 1);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
+
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();
/*
@@ -642,10 +714,11 @@ void __init alternative_instructions(void)
_text, _etext);
}
- if (!uniproc_patched || num_possible_cpus() == 1)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
Introduce in-kernel headers which are made available as an archive
through proc (/proc/kheaders.tar.xz file). This archive makes it
possible to run eBPF and other tracing programs that need to extend the
kernel for tracing purposes without any dependency on the file system
having headers.
A github PR is sent for the corresponding BCC patch at:
https://github.com/iovisor/bcc/pull/2312
On Android and embedded systems, it is common to switch kernels but not
have kernel headers available on the file system. Further, once a
different kernel is booted, any headers stored on the file system will
no longer be useful. This is an issue well known even to distros.
By storing the headers as a compressed archive within the kernel, we can
avoid these issues that have been a hindrance for a long time.
The best way to use this feature is by building it in. Several users
have a need for this: when they switch debug kernels, they do not want to
update the filesystem or worry about where to store the headers on
it. However, the feature is also buildable as a module in case the user
desires that it not be part of the kernel image. This makes it possible to
load and unload the headers from memory on demand. A tracing program can
load the module, do its operations, and then unload the module to save
kernel memory. The total memory needed is 3.3MB.
By having the archive available at a fixed location independent of
filesystem dependencies and conventions, all debugging tools can
directly refer to the fixed location for the archive, without concern
for where the headers live on a typical filesystem, which significantly
simplifies tooling that needs kernel headers.
The code to read the headers is based on /proc/config.gz code and uses
the same technique to embed the headers.
Other approaches were discussed, such as having an in-memory mountable
filesystem, but that has drawbacks such as requiring an in-kernel xz
decompressor which we don't have today, and requiring usage of 42 MB of
kernel memory to host the decompressed headers at any time. This
approach is also simpler than those alternatives.
Reviewed-by: Masahiro Yamada <yamada.masahiro(a)socionext.com>
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
(Just a resend with Masahiro's Reviewed-by tag added)
v6 -> v7:
- Minor nits from Masahiro Yamada are addressed.
v5 -> v6: (Masahiro Yamada suggestions mostly)
- Dropped support for module building.
- Rebuild archive if script changes.
- Move archive file list to script.
- Move build script to kernel directory.
v4 -> v5:
(v4 was Tested-by the following folks)
Tested-by: qais.yousef(a)arm.com
Tested-by: dietmar.eggemann(a)arm.com
Tested-by: linux(a)manojrajarao.com
(Thanks to Masahiro Yamada for several excellent suggestions)
- used incbin instead of bin2c (Masahiro did similar idea)
- added module.lds for ia64, otherwise ia64 may fail to build.
- added clean-files rule to Makefile
- removed strip-comments script and doing it inline
- added set -e to the header generation script to die on errors
- fixed a minor issue where find command was noisy.
- removed unneeded tar.xz rule from kernel/.gitignore
- added Tested-by tags from ARM folks.
Changes since v3:
- Blank tar was being generated because of a one-line change I
forgot to push. It is updated now.
- Added module.lds since arm64 needs it to build modules.
Changes since v2:
(Thanks to Masahiro Yamada for several excellent suggestions)
- Added support for out of tree builds.
- Added incremental build support bringing down build time of
incremental builds from 50 seconds to 5 seconds.
- Fixed various small nits / cleanups.
- clean ups to kheaders.c pointed by Alexey Dobriyan.
- Fixed MODULE_LICENSE in test module and kheaders.c
- Dropped Module.symvers from archive due to circular dependency.
Changes since v1:
- removed IKH_EXTRA variable, not needed (Masahiro Yamada)
- small fix ups to selftest
- added target to main Makefile etc
- added MODULE_LICENSE to test module
- made selftest more quiet
Changes since RFC:
Both changes bring size down to 3.8MB:
- use xz for compression
- strip comments except SPDX lines
- Call out the module name in Kconfig
- Also added selftests in second patch to ensure headers are always
working.
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
init/Kconfig | 10 +++++
kernel/.gitignore | 1 +
kernel/Makefile | 10 +++++
kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++
kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++
5 files changed, 184 insertions(+)
create mode 100755 kernel/gen_ikh_data.sh
create mode 100644 kernel/kheaders.c
diff --git a/init/Kconfig b/init/Kconfig
index 4592bf7997c0..47c0db6e63a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -580,6 +580,16 @@ config IKCONFIG_PROC
This option enables access to the kernel configuration file
through /proc/config.gz.
+config IKHEADERS_PROC
+ tristate "Enable kernel header artifacts through /proc/kheaders.tar.xz"
+ depends on PROC_FS
+ help
+ This option enables access to the kernel header and other artifacts that
+ are generated during the build process. These can be used to build eBPF
+ tracing programs, or similar programs. If you build the headers as a
+ module, a module called kheaders.ko is built which can be loaded on-demand
+ to get access to the headers.
+
config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
range 12 25
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
#
# Generated files
#
+kheaders.md5
timeconst.h
hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..12399614c350 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
+
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
+
+quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
+cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
+$(obj)/kheaders_data.tar.xz: FORCE
+ $(call cmd,genikh)
+
+clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh
new file mode 100755
index 000000000000..591a94f7b387
--- /dev/null
+++ b/kernel/gen_ikh_data.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This script generates an archive consisting of kernel headers
+# for CONFIG_IKHEADERS_PROC.
+set -e
+spath="$(dirname "$(readlink -f "$0")")"
+kroot="$spath/.."
+outdir="$(pwd)"
+tarfile=$1
+cpio_dir=$outdir/$tarfile.tmp
+
+# Script filename relative to the kernel source root
+# We add it to the archive because it is small and any changes
+# to this script will also cause a rebuild of the archive.
+sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
+
+src_file_list="
+include/
+arch/$SRCARCH/include/
+$sfile
+"
+
+obj_file_list="
+include/
+arch/$SRCARCH/include/
+"
+
+# Support incremental builds by skipping archive generation
+# if timestamps of files being archived are not changed.
+
+# This block is useful for debugging the incremental builds.
+# Uncomment it for debugging.
+# iter=1
+# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter;
+# else; iter=$(($(cat /tmp/iter) + 1)); fi
+# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
+# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
+
+# include/generated/compile.h is ignored because it is touched even when none
+# of the source files changed. This causes pointless regeneration, so let us
+# ignore them for md5 calculation.
+pushd $kroot > /dev/null
+src_files_md5="$(find $src_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+popd > /dev/null
+obj_files_md5="$(find $obj_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+
+if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
+if [ -f kernel/kheaders.md5 ] &&
+ [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
+ exit
+fi
+
+if [ "${quiet}" != "silent_" ]; then
+ echo " GEN $tarfile"
+fi
+
+rm -rf $cpio_dir
+mkdir $cpio_dir
+
+pushd $kroot > /dev/null
+for f in $src_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir
+popd > /dev/null
+
+# The second CPIO can complain if files already exist which can
+# happen with out of tree builds. Just silence CPIO for now.
+for f in $obj_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+
+# Remove comments except SPDX lines
+find $cpio_dir -type f -print0 |
+ xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+
+tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
+
+echo "$src_files_md5" > kernel/kheaders.md5
+echo "$obj_files_md5" >> kernel/kheaders.md5
+echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+
+rm -rf $cpio_dir
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..70ae6052920d
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel headers useful to build tracing programs
+ * such as for running eBPF tracing tools.
+ *
+ * (Borrowed code from kernel/configs.c)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+
+/*
+ * Define kernel_headers_data and kernel_headers_data_end, within which the
+ * compressed kernel headers are stored. The file is first compressed with xz.
+ */
+
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .global kernel_headers_data \n"
+"kernel_headers_data: \n"
+" .incbin \"kernel/kheaders_data.tar.xz\" \n"
+" .global kernel_headers_data_end \n"
+"kernel_headers_data_end: \n"
+" .popsection \n"
+);
+
+extern char kernel_headers_data;
+extern char kernel_headers_data_end;
+
+static ssize_t
+ikheaders_read_current(struct file *file, char __user *buf,
+ size_t len, loff_t *offset)
+{
+ return simple_read_from_buffer(buf, len, offset,
+ &kernel_headers_data,
+ &kernel_headers_data_end -
+ &kernel_headers_data);
+}
+
+static const struct file_operations ikheaders_file_ops = {
+ .read = ikheaders_read_current,
+ .llseek = default_llseek,
+};
+
+static int __init ikheaders_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ /* create the current headers file */
+ entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
+ &ikheaders_file_ops);
+ if (!entry)
+ return -ENOMEM;
+
+ proc_set_size(entry,
+ &kernel_headers_data_end -
+ &kernel_headers_data);
+ return 0;
+}
+
+static void __exit ikheaders_cleanup(void)
+{
+ remove_proc_entry("kheaders.tar.xz", NULL);
+}
+
+module_init(ikheaders_init);
+module_exit(ikheaders_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joel Fernandes");
+MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
--
2.21.0.593.g511ec345e18-goog
In one of my rcutorture tests the TSC clocksource got marked unstable
due to a large difference in the TSC value. I'm not sure if the guest
ran for a long time with disabled interrupts or if the host was very
busy and didn't schedule the guest for some time.
I took a look on the qemu/KVM options and decided to update the options:
- Use kvm{32|64} as CPU. We could probably use `host' (like ARM does)
for maximum available features but since we don't run any userland I'm
not sure if it makes any difference.
- Drop the "noapic" option, enable TSC deadline timer. There is no
history of why the APIC was disabled; I see no reason for it. The
deadline timer is probably "nicer".
- Additional config options. They ensure that the kernel knows that it
runs as a kvm guest and can use virt devices like the kvm-clock as
clocksource. The kvm-clock was the main motivation here.
- I didn't add a random HW device. It would make the random device ready
earlier (now it doesn't complete the initialisation at all) but I
doubt that there is any need for this.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
---
tools/testing/selftests/rcutorture/bin/functions.sh | 13 ++++++++++++-
.../selftests/rcutorture/configs/rcu/CFcommon | 4 ++++
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 6bcb8b5b2ff22..be3c5c73d7e79 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -172,7 +172,7 @@ identify_qemu_append () {
local console=ttyS0
case "$1" in
qemu-system-x86_64|qemu-system-i386)
- echo noapic selinux=0 initcall_debug debug
+ echo selinux=0 initcall_debug debug
;;
qemu-system-aarch64)
console=ttyAMA0
@@ -191,8 +191,19 @@ identify_qemu_append () {
# Output arguments for qemu arguments based on the TORTURE_QEMU_MAC
# and TORTURE_QEMU_INTERACTIVE environment variables.
identify_qemu_args () {
+ local KVM_CPU=""
+ case "$1" in
+ qemu-system-x86_64)
+ KVM_CPU=kvm64
+ ;;
+ qemu-system-i386)
+ KVM_CPU=kvm32
+ ;;
+ esac
case "$1" in
qemu-system-x86_64|qemu-system-i386)
+ echo -machine q35,accel=kvm
+ echo -cpu ${KVM_CPU},x2apic=on,tsc-deadline=on,hypervisor=on,tsc_adjust=on
;;
qemu-system-aarch64)
echo -machine virt,gic-version=host -cpu host
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index d2d2a86139db1..322d5d40443cd 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,2 +1,6 @@
CONFIG_RCU_TORTURE_TEST=y
CONFIG_PRINTK_TIME=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_PARAVIRT_SPINLOCKS=y
+CONFIG_KVM_GUEST=y
--
2.20.1
From: Peter Zijlstra <peterz(a)infradead.org>
Nicolai Stange discovered[1] that if live kernel patching is enabled, and the
function tracer started tracing the same function that was patched, the
conversion of the fentry call site during the translation of going from
calling the live kernel patch trampoline to the iterator trampoline would
have a slight window where it didn't call anything.
As live kernel patching depends on ftrace to always call its code (to
prevent the function being traced from being called, as it will redirect
it), this small window would allow the old buggy function to be called, and
this can cause undesirable results.
Nicolai submitted new patches[2] but these were controversial, as this is
similar to the static call emulation issues that came up a while ago[3].
After some debate[4][5], adding a gap in the stack when entering the
breakpoint handler allows for pushing the return address onto the stack to
easily emulate a call.
[1] http://lkml.kernel.org/r/20180726104029.7736-1-nstange@suse.de
[2] http://lkml.kernel.org/r/20190427100639.15074-1-nstange@suse.de
[3] http://lkml.kernel.org/r/3cf04e113d71c9f8e4be95fb84a510f085aa4afa.154171145…
[4] http://lkml.kernel.org/r/CAHk-=wh5OpheSU8Em_Q3Hg8qw_JtoijxOdPtHru6d+5K8TWM=…
[5] http://lkml.kernel.org/r/CAHk-=wjvQxY4DvPrJ6haPgAa6b906h=MwZXO6G8OtiTGe=N7_…
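For reference, the emulation helpers used in the handler below boil down to
editing the saved pt_regs: a virtual push onto the interrupted stack (made
safe by the stack gap mentioned above) followed by a jump. Roughly (a sketch
of the asm/text-patching.h helpers; see the tree for the exact definitions):

	static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
	{
		regs->ip = ip;
	}

	static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
	{
		/*
		 * Relies on the gap the entry code leaves between the interrupted
		 * stack and pt_regs, so extending the stack here cannot clobber
		 * the saved frame.
		 */
		regs->sp -= sizeof(unsigned long);
		*(unsigned long *)regs->sp = val;
	}

	static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
	{
		/* Return to the instruction following the patched CALL site. */
		int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
		int3_emulate_jmp(regs, func);
	}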
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Nicolai Stange <nstange(a)suse.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: the arch/x86 maintainers <x86(a)kernel.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: Jiri Kosina <jikos(a)kernel.org>
Cc: Miroslav Benes <mbenes(a)suse.cz>
Cc: Petr Mladek <pmladek(a)suse.com>
Cc: Joe Lawrence <joe.lawrence(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Cc: Mimi Zohar <zohar(a)linux.ibm.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Nayna Jain <nayna(a)linux.ibm.com>
Cc: Masahiro Yamada <yamada.masahiro(a)socionext.com>
Cc: Joerg Roedel <jroedel(a)suse.de>
Cc: "open list:KERNEL SELFTEST FRAMEWORK" <linux-kselftest(a)vger.kernel.org>
Cc: stable(a)vger.kernel.org
Fixes: b700e7f03df5 ("livepatch: kernel: add support for live patching")
Signed-off-by: *** Need SoB From Peter Zijlstra ***
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
arch/x86/kernel/ftrace.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..fd152f5a937b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
}
static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;
static int update_ftrace_func(unsigned long ip, void *new)
{
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;
+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -294,13 +298,21 @@ int ftrace_int3_handler(struct pt_regs *regs)
if (WARN_ON_ONCE(!regs))
return 0;
- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
- return 0;
+ ip = regs->ip - INT3_INSN_SIZE;
- regs->ip += MCOUNT_INSN_SIZE - 1;
+ if (ftrace_location(ip)) {
+ int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ if (!ftrace_update_func_call) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+ int3_emulate_call(regs, ftrace_update_func_call);
+ return 1;
+ }
- return 1;
+ return 0;
}
NOKPROBE_SYMBOL(ftrace_int3_handler);
@@ -859,6 +871,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
func = ftrace_ops_get_func(ops);
+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +974,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
+ ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
return update_ftrace_func(ip, new);
--
2.20.1
On Mon, Apr 29, 2019 at 12:13 PM Linus Torvalds
<torvalds(a)linux-foundation.org> wrote:
>
>
>
> On Mon, Apr 29, 2019, 12:02 Linus Torvalds <torvalds(a)linux-foundation.org> wrote:
>>
>>
>>
>> If nmi were to break it, it would be a cpu bug.
>
>
> Side note: we *already* depend on sti shadow working in other parts of the kernel, namely sti->iret.
>
Where? STI; IRET would be nuts.
Before:
commit 4214a16b02971c60960afd675d03544e109e0d75
Author: Andy Lutomirski <luto(a)kernel.org>
Date: Thu Apr 2 17:12:12 2015 -0700
x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER
we did sti; sysexit, but, when we discussed this, I don't recall anyone
speaking up in favor of the safety of the old code.
Not to mention that the crash we'll get if we get an NMI and a
rescheduling interrupt in this path will be very, very hard to debug.
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Nicolai Stange discovered[1] that if live kernel patching is enabled, and the
function tracer started tracing the same function that was patched, the
conversion of the fentry call site during the translation of going from
calling the live kernel patch trampoline to the iterator trampoline would
have a slight window where it didn't call anything.
As live kernel patching depends on ftrace to always call its code (to
prevent the function being traced from being called, as it will redirect
it), this small window would allow the old buggy function to be called, and
this can cause undesirable results.
Nicolai submitted new patches[2] but these were controversial. As this is
similar to the static call emulation issues that came up a while ago[3],
Linus suggested using per CPU data along with special trampolines[4] to emulate
the calls.
Linus's solution was for text poke (which was mostly what the static_call
code did), but as ftrace has its own mechanism, it required doing its own
thing.
Having ftrace use its own per CPU data and having its own set of specialized
trampolines solves the issue of missed calls that live kernel patching
suffers.
[1] http://lkml.kernel.org/r/20180726104029.7736-1-nstange@suse.de
[2] http://lkml.kernel.org/r/20190427100639.15074-1-nstange@suse.de
[3] http://lkml.kernel.org/r/3cf04e113d71c9f8e4be95fb84a510f085aa4afa.154171145…
[4] http://lkml.kernel.org/r/CAHk-=wh5OpheSU8Em_Q3Hg8qw_JtoijxOdPtHru6d+5K8TWM=…
Inspired-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
Fixes: b700e7f03df5 ("livepatch: kernel: add support for live patching")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
Changes since v2:
- Moved inline asm to ftrace_64.S
- Used PER_CPU_VAR() and TRACE_IRQS_ON macros in assembly
- Renamed trampolines to be a little more coherent
- Created assembly version of STACK_FRAME_NON_STANDARD() in asm/frame.h
- Call ftrace_regs_caller instead of ftrace_caller
- No longer support 32 bit (it crashed badly on tests, and I couldn't figure it out)
arch/x86/include/asm/frame.h | 15 ++++++
arch/x86/kernel/ftrace.c | 102 ++++++++++++++++++++++++++++++++++-
arch/x86/kernel/ftrace_64.S | 56 +++++++++++++++++++
3 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 5cbce6fbb534..04892b374b93 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -42,4 +42,19 @@
#endif /* CONFIG_FRAME_POINTER */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_STACK_VALIDATION
+#define STACK_FRAME_NON_STANDARD(func) \
+.section .discard.func_stack_frame_non_standard,"aw"; \
+.align 8; \
+.type __func_stack_frame_non_standard_##func, @object; \
+.size __func_stack_frame_non_standard_##func, 8; \
+__func_stack_frame_non_standard_##func: \
+.quad func; \
+.previous
+#else /* !CONFIG_STACK_VALIDATION */
+#define STACK_FRAME_NON_STANDARD(func)
+#endif /* CONFIG_STACK_VALIDATION */
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_FRAME_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..634fc0d4fe97 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -232,6 +232,9 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
static unsigned long ftrace_update_func;
+/* Used within inline asm below */
+unsigned long ftrace_update_func_call;
+
static int update_ftrace_func(unsigned long ip, void *new)
{
unsigned char old[MCOUNT_INSN_SIZE];
@@ -259,6 +262,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
unsigned char *new;
int ret;
+ ftrace_update_func_call = (unsigned long)func;
+
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -280,6 +285,46 @@ static nokprobe_inline int is_ftrace_caller(unsigned long ip)
return 0;
}
+#ifdef CONFIG_X86_64
+/*
+ * We need to handle the "call func1" -> "call func2" case.
+ * Just skipping the call is not sufficient as it will be like
+ * changing to "nop" first and then updating the call. But some
+ * users of ftrace require calls never to be missed.
+ *
+ * To emulate the call while converting the call site with a breakpoint,
+ * some trampolines are used along with per CPU buffers.
+ * There are three trampolines for the call sites and three trampolines
+ * for the updating of the call in ftrace trampoline. The three
+ * trampolines are:
+ *
+ * 1) Interrupts are enabled when the breakpoint is hit
+ * 2) Interrupts are disabled when the breakpoint is hit
+ * 3) The breakpoint was hit in an NMI
+ *
+ * As per CPU data is used, interrupts must be disabled to prevent them
+ * from corrupting the data. A separate NMI trampoline is used for the
+ * NMI case. If interrupts are already disabled, then the return path
+ * of where the breakpoint was hit (saved in the per CPU data) is pushed
+ * on the stack and then a jump to either the ftrace_caller (which will
+ * loop through all registered ftrace_ops handlers depending on the ip
+ * address), or if its a ftrace trampoline call update, it will call
+ * ftrace_update_func_call which will hold the call that should be
+ * called.
+ */
+extern asmlinkage void ftrace_emulate_call_sti(void);
+extern asmlinkage void ftrace_emulate_call(void);
+extern asmlinkage void ftrace_emulate_call_nmi(void);
+extern asmlinkage void ftrace_emulate_call_update_sti(void);
+extern asmlinkage void ftrace_emulate_call_update(void);
+extern asmlinkage void ftrace_emulate_call_update_nmi(void);
+
+DEFINE_PER_CPU(void *, ftrace_bp_call_return);
+DEFINE_PER_CPU(void *, ftrace_bp_call_nmi_return);
+
+/* To hold the ftrace_regs_caller address to push on the stack */
+void *ftrace_caller_func = (void *)ftrace_regs_caller;
+
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -291,6 +336,58 @@ int ftrace_int3_handler(struct pt_regs *regs)
{
unsigned long ip;
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ ip = regs->ip - 1;
+ if (ftrace_location(ip)) {
+ /* A breakpoint at the beginning of the function was hit */
+ if (in_nmi()) {
+ /* NMIs have their own trampoline */
+ this_cpu_write(ftrace_bp_call_nmi_return, (void *)ip + MCOUNT_INSN_SIZE);
+ regs->ip = (unsigned long) ftrace_emulate_call_nmi;
+ return 1;
+ }
+ this_cpu_write(ftrace_bp_call_return, (void *)ip + MCOUNT_INSN_SIZE);
+ if (regs->flags & X86_EFLAGS_IF) {
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long) ftrace_emulate_call_sti;
+ /* Tell lockdep here we are enabling interrupts */
+ lockdep_hardirqs_on(_THIS_IP_);
+ } else {
+ regs->ip = (unsigned long) ftrace_emulate_call;
+ }
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ /* An ftrace trampoline is being updated */
+ if (!ftrace_update_func_call) {
+ /* If it's a jump, just need to skip it */
+ regs->ip += MCOUNT_INSN_SIZE -1;
+ return 1;
+ }
+ if (in_nmi()) {
+ /* NMIs have their own trampoline */
+ this_cpu_write(ftrace_bp_call_nmi_return, (void *)ip + MCOUNT_INSN_SIZE);
+ regs->ip = (unsigned long) ftrace_emulate_call_update_nmi;
+ return 1;
+ }
+ this_cpu_write(ftrace_bp_call_return, (void *)ip + MCOUNT_INSN_SIZE);
+ if (regs->flags & X86_EFLAGS_IF) {
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long) ftrace_emulate_call_update_sti;
+ } else {
+ regs->ip = (unsigned long) ftrace_emulate_call_update;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+#else /* !X86_64 */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ unsigned long ip;
+
if (WARN_ON_ONCE(!regs))
return 0;
@@ -299,9 +396,9 @@ int ftrace_int3_handler(struct pt_regs *regs)
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
-
return 1;
}
+#endif
NOKPROBE_SYMBOL(ftrace_int3_handler);
static int ftrace_write(unsigned long ip, const char *val, int size)
@@ -859,6 +956,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
func = ftrace_ops_get_func(ops);
+ ftrace_update_func_call = (unsigned long)func;
+
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
@@ -960,6 +1059,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
+ ftrace_update_func_call = 0;
new = ftrace_jmp_replace(ip, (unsigned long)func);
return update_ftrace_func(ip, new);
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 75f2b36b41a6..8642e1719370 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -9,6 +9,9 @@
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/irqflags.h>
+#include <asm/percpu.h>
+#include <asm/frame.h>
.code64
.section .entry.text, "ax"
@@ -262,6 +265,59 @@ GLOBAL(ftrace_regs_caller_end)
ENDPROC(ftrace_regs_caller)
+/* Trampoline for function update with interrupts enabled */
+GLOBAL(ftrace_emulate_call_sti)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_caller_func
+ TRACE_IRQS_ON
+ sti
+ ret
+ENDPROC(ftrace_emulate_call_sti)
+
+/* Trampoline for function update with interrupts disabled */
+GLOBAL(ftrace_emulate_call)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_caller_func
+ ret
+ENDPROC(ftrace_emulate_call)
+
+/* Trampoline for function update in an NMI */
+GLOBAL(ftrace_emulate_call_nmi)
+ push PER_CPU_VAR(ftrace_bp_call_nmi_return)
+ push ftrace_caller_func
+ ret
+ENDPROC(ftrace_emulate_call_nmi)
+
+/* Trampoline for ftrace trampoline call update with interrupts enabled */
+GLOBAL(ftrace_emulate_call_update_sti)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_update_func_call
+ TRACE_IRQS_ON
+ sti
+ ret
+ENDPROC(ftrace_emulate_call_update_sti)
+
+/* Trampoline for ftrace trampoline call update with interrupts disabled */
+GLOBAL(ftrace_emulate_call_update)
+ push PER_CPU_VAR(ftrace_bp_call_return)
+ push ftrace_update_func_call
+ ret
+ENDPROC(ftrace_emulate_call_update)
+
+/* Trampoline for ftrace trampoline call update in an NMI */
+GLOBAL(ftrace_emulate_call_update_nmi)
+ push PER_CPU_VAR(ftrace_bp_call_nmi_return)
+ push ftrace_update_func_call
+ ret
+ENDPROC(ftrace_emulate_call_update_nmi)
+
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_sti)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_nmi)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update_sti)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update)
+STACK_FRAME_NON_STANDARD(ftrace_emulate_call_update_nmi)
+
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook)
--
2.20.1
pidfds are file descriptors referring to a process created with the
CLONE_PIDFD clone(2) flag. The Android low memory killer (LMK) needs
pidfd polling support to replace code that currently checks for the
existence of /proc/pid to know that a process it has signalled to be
killed has died, which is both racy and slow. The pidfd poll approach is
race-free, and also allows the LMK to do other things (such as polling
on other fds) while waiting for the process being killed to die.
It prevents a situation where a PID is reused between the time LMK sends
a kill signal and the time it checks for the existence of the PID, since
the existence check could then be performed on the wrong process.
In this patch, we follow the existing mechanism in the kernel used when
the parent of the task group is to be notified (do_notify_parent): this
is also the point at which tasks waiting on a poll of the pidfd are
awakened.
We have decided to include the waitqueue in struct pid for the following
reasons:
1. The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be an option in this case because the task
can be reaped and destroyed before the poll returns.
2. Including the waitqueue in struct pid means that during de_thread(),
the new thread group leader automatically gets the new waitqueue/pid
even though its task_struct is different.
Appropriate test cases are added in the second patch to provide coverage
of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference;
however, this patch tries to handle different situations properly
related to thread group existence and how/where it notifies, and it also
solves other bugs (waitqueue lifetime). Daniel had a similar patch [2]
recently which this patch supersedes.
[1] https://lore.kernel.org/patchwork/patch/345098/
[2] https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
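For illustration, a minimal userspace consumer of this could look
roughly like the sketch below. It is not part of this patch and makes a
few assumptions: the CLONE_PIDFD semantics from the series referenced
above (pidfd returned through the parent_tid argument), the raw clone()
argument order of x86_64, and a placeholder value for the CLONE_PIDFD
flag.
#define _GNU_SOURCE
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000	/* assumed value from the CLONE_PIDFD series */
#endif
int main(void)
{
	int pidfd = -1;
	/* The pidfd is assumed to be returned via the parent_tid pointer. */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, NULL, &pidfd,
			    NULL, 0);
	if (pid < 0)
		return 1;
	if (pid == 0) {		/* child: exit after a short delay */
		sleep(1);
		_exit(0);
	}
	/* Parent: block until the child's whole thread group has exited. */
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
		printf("child %d exited, safe to reap and recycle its PID\n",
		       pid);
	return 0;
}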
Cc: luto(a)amacapital.net
Cc: rostedt(a)goodmis.org
Cc: dancol(a)google.com
Cc: sspatil(a)google.com
Cc: christian(a)brauner.io
Cc: jannh(a)google.com
Cc: surenb(a)google.com
Cc: timmurray(a)google.com
Cc: Jonathan Kowalski <bl0pbl33p(a)gmail.com>
Cc: torvalds(a)linux-foundation.org
Cc: kernel-team(a)android.com
Co-developed-by: Daniel Colascione <dancol(a)google.com>
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
RFC -> v1:
* Based on CLONE_PIDFD patches: https://lwn.net/Articles/786244/
* Updated selftests.
* Renamed poll wake function to do_notify_pidfd.
* Removed depending on EXIT flags
* Removed POLLERR flag since its semantics are controversial and
we don't have use cases for it right now (we can add it later if there's
a need for it).
include/linux/pid.h | 3 +++
kernel/fork.c | 33 +++++++++++++++++++++++++++++++++
kernel/pid.c | 2 ++
kernel/signal.c | 14 ++++++++++++++
4 files changed, 52 insertions(+)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 3c8ef5a199ca..1484db6ca8d1 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/wait.h>
enum pid_type
{
@@ -60,6 +61,8 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
+ /* wait queue for pidfd notifications */
+ wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
};
diff --git a/kernel/fork.c b/kernel/fork.c
index 5525837ed80e..fb3b614f6456 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1685,8 +1685,41 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct task_struct *task;
+ struct pid *pid;
+ int poll_flags = 0;
+
+ /*
+	 * tasklist_lock must be held to avoid racing with changes in
+	 * exit_state and the wake up. Basically to avoid:
+ *
+ * P0: read exit_state = 0
+ * P1: write exit_state = EXIT_DEAD
+ * P1: Do a wake up - wq is empty, so do nothing
+ * P0: Queue for polling - wait forever.
+ */
+ read_lock(&tasklist_lock);
+ pid = file->private_data;
+ task = pid_task(pid, PIDTYPE_PID);
+ WARN_ON_ONCE(task && !thread_group_leader(task));
+
+ if (!task || (task->exit_state && thread_group_empty(task)))
+ poll_flags = POLLIN | POLLRDNORM;
+
+ if (!poll_flags)
+ poll_wait(file, &pid->wait_pidfd, pts);
+
+ read_unlock(&tasklist_lock);
+
+ return poll_flags;
+}
+
+
const struct file_operations pidfd_fops = {
.release = pidfd_release,
+ .poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
#endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..5c90c239242f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c
index 1581140f2d99..16e7718316e5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret;
}
+static void do_notify_pidfd(struct task_struct *task)
+{
+ struct pid *pid;
+
+ lockdep_assert_held(&tasklist_lock);
+
+ pid = get_task_pid(task, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+ put_pid(pid);
+}
+
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */
+ do_notify_pidfd(tsk);
+
if (sig != SIGCHLD) {
/*
* This is only possible if parent == real_parent.
--
2.21.0.593.g511ec345e18-goog
Hi,
this series is the result of the discussion to the RFC patch found at [1].
The goal is to make x86' ftrace_int3_handler() not to simply skip over
the trapping instruction as this is problematic in the context of
the live patching consistency model. For details, c.f. the commit message
of [3/4] ("x86/ftrace: make ftrace_int3_handler() not to skip fops
invocation").
Everything is based on v5.1-rc6; please let me know in case you want me to
rebase on something else.
For x86_64, the live patching selftest added in [4/4] succeeds with this
series applied and fails without it. On 32 bits I only compile-tested.
checkpatch reports warnings about
- an overlong line in assembly -- I chose to ignore that
- MAINTAINERS perhaps needing updates due to the new files
arch/x86/kernel/ftrace_int3_stubs.S and
tools/testing/selftests/livepatch/test-livepatch-vs-ftrace.sh.
As the existing arch/x86/kernel/ftrace_{32,64}.S haven't got an
explicit entry either, this one is probably Ok? The selftest
definitely is.
Changes to the RFC patch:
- s/trampoline/stub/ to avoid confusion with the ftrace_ops' trampolines,
- use a fixed size stack kept in struct thread_info for passing the
(adjusted) ->ip values from ftrace_int3_handler() to the stubs,
- provide one stub for each of the two possible jump targets and hardcode
those,
- add the live patching selftest.
Thanks,
Nicolai
Nicolai Stange (4):
x86/thread_info: introduce ->ftrace_int3_stack member
ftrace: drop 'static' qualifier from ftrace_ops_list_func()
x86/ftrace: make ftrace_int3_handler() not to skip fops invocation
selftests/livepatch: add "ftrace a live patched function" test
arch/x86/include/asm/thread_info.h | 11 +++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/asm-offsets.c | 8 +++
arch/x86/kernel/ftrace.c | 79 +++++++++++++++++++---
arch/x86/kernel/ftrace_int3_stubs.S | 61 +++++++++++++++++
kernel/trace/ftrace.c | 8 +--
tools/testing/selftests/livepatch/Makefile | 3 +-
.../livepatch/test-livepatch-vs-ftrace.sh | 44 ++++++++++++
8 files changed, 199 insertions(+), 16 deletions(-)
create mode 100644 arch/x86/kernel/ftrace_int3_stubs.S
create mode 100755 tools/testing/selftests/livepatch/test-livepatch-vs-ftrace.sh
--
2.13.7
On Mon, Apr 29, 2019 at 11:53 AM Linus Torvalds
<torvalds(a)linux-foundation.org> wrote:
>
>
>
> On Mon, Apr 29, 2019, 11:42 Andy Lutomirski <luto(a)kernel.org> wrote:
>>
>>
>> I'm less than 100% convinced about this argument. Sure, an NMI right
>> there won't cause a problem. But an NMI followed by an interrupt will
>> kill us if preemption is on. I can think of three solutions:
>
>
> No, because either the sti shadow disables nmi too (that's the case on some CPUs at least) or the iret from nmi does.
>
> Otherwise you could never trust the whole sti shadow thing - and it very much is part of the architecture.
>
Is this documented somewhere? And do you actually believe that this
is true under KVM, Hyper-V, etc? As I recall, Andrew Cooper dug in to
the way that VMX dealt with this stuff and concluded that the SDM was
blatantly wrong in many cases, which leads me to believe that Xen
HVM/PVH is the *only* hypervisor that gets it right.
Steven's point about batched updates is quite valid, though. My
personal favorite solution to this whole mess is to rework the whole
thing so that the int3 handler simply returns and retries and to
replace the sync_core() broadcast with an SMI broadcast. I don't know
whether this will actually work on real CPUs and on VMs and whether
it's going to crash various BIOSes out there.
On Mon, Apr 29, 2019 at 12:02 PM Linus Torvalds
<torvalds(a)linux-foundation.org> wrote:
>
> If nmi were to break it, it would be a cpu bug. I'm pretty sure I've
> seen the "shadow stops even nmi" documented for some uarch, but as
> mentioned it's not necessarily the only way to guarantee the shadow.
In fact, the documentation is simply the official Intel instruction
docs for "STI":
The IF flag and the STI and CLI instructions do not prohibit the
generation of exceptions and NMI interrupts. NMI interrupts (and
SMIs) may be blocked for one macroinstruction following an STI.
note the "may be blocked". As mentioned, that's just one option for
not having NMI break the STI shadow guarantee, but it's clearly one
that Intel has done at times, and clearly even documents as having
done so.
There is absolutely no question that the sti shadow is real, and that
people have depended on it for _decades_. It would be a horrible
errata if the shadow can just be made to go away by randomly getting
an NMI or SMI.
Linus
On Mon, 29 Apr 2019 11:59:04 -0700
Linus Torvalds <torvalds(a)linux-foundation.org> wrote:
> I really don't care. Just do what I suggested, and if you have numbers to
> show problems, then maybe I'll care.
>
Are you suggesting that I rewrite the code to do it one function at a
time? This has always been batch mode. This is not something new. The
function tracer has been around longer than the text poke code.
> Right now you're just making excuses for this. I described the solution
> months ago, now I've written a patch, if that's not good enough then we can
> just skip this all entirely.
>
> Honestly, if you need to rewrite tens of thousands of calls, maybe you're
> doing something wrong?
>
# cd /sys/kernel/debug/tracing
# cat available_filter_functions | wc -l
45856
# cat enabled_functions | wc -l
0
# echo function > current_tracer
# cat enabled_functions | wc -l
45856
There, I just enabled 45,856 function call sites in one shot!
How else do you want to update them? Every function in the kernel has a
nop that turns into a call to the ftrace_handler; if I add another
user of that code, it will change each one as well.
-- Steve
The entries within __rseq_table are aligned on 32 bytes due to
linux/rseq.h struct rseq_cs uapi requirements, but the start of the
__rseq_table section is not guaranteed to be 32-byte aligned. It can
cause padding to be added at the start of the section, which makes it
hard to use as an array of items by debuggers.
Considering that __rseq_table does not really consist of a table due to
the presence of padding, rename this section to __rseq_cs.
Create a new __rseq_cs_ptr_array section which contains 64-bit packed
pointers to entries within the __rseq_cs section.
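As a rough illustration of why a packed pointer array is easier to
consume, a debugger or tooling helper could walk the new section along
the lines of the sketch below. This is only a sketch: it assumes GNU
ld's automatically generated __start_/__stop_ symbols for the section
and mirrors the struct rseq_cs layout from linux/rseq.h under a
hypothetical local name.
#include <stdint.h>
#include <stdio.h>
/* Hypothetical local mirror of the uapi struct rseq_cs layout. */
struct rseq_cs_desc {
	uint32_t version;
	uint32_t flags;
	uint64_t start_ip;
	uint64_t post_commit_offset;
	uint64_t abort_ip;
} __attribute__((aligned(32)));
/* Emitted by GNU ld for sections whose names are valid C identifiers. */
extern const uint64_t __start___rseq_cs_ptr_array[];
extern const uint64_t __stop___rseq_cs_ptr_array[];
static void dump_rseq_critical_sections(void)
{
	const uint64_t *p;
	for (p = __start___rseq_cs_ptr_array;
	     p < __stop___rseq_cs_ptr_array; p++) {
		const struct rseq_cs_desc *cs =
			(const struct rseq_cs_desc *)(uintptr_t)*p;
		printf("rseq_cs %p: start_ip=0x%llx post_commit_offset=%llu abort_ip=0x%llx\n",
		       (const void *)cs,
		       (unsigned long long)cs->start_ip,
		       (unsigned long long)cs->post_commit_offset,
		       (unsigned long long)cs->abort_ip);
	}
}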
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
CC: Thomas Gleixner <tglx(a)linutronix.de>
CC: Joel Fernandes <joelaf(a)google.com>
CC: Peter Zijlstra <peterz(a)infradead.org>
CC: Catalin Marinas <catalin.marinas(a)arm.com>
CC: Dave Watson <davejwatson(a)fb.com>
CC: Will Deacon <will.deacon(a)arm.com>
CC: Shuah Khan <shuah(a)kernel.org>
CC: Andi Kleen <andi(a)firstfloor.org>
CC: linux-kselftest(a)vger.kernel.org
CC: "H . Peter Anvin" <hpa(a)zytor.com>
CC: Chris Lameter <cl(a)linux.com>
CC: Russell King <linux(a)arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages(a)gmail.com>
CC: "Paul E . McKenney" <paulmck(a)linux.vnet.ibm.com>
CC: Paul Turner <pjt(a)google.com>
CC: Boqun Feng <boqun.feng(a)gmail.com>
CC: Josh Triplett <josh(a)joshtriplett.org>
CC: Steven Rostedt <rostedt(a)goodmis.org>
CC: Ben Maurer <bmaurer(a)fb.com>
CC: linux-api(a)vger.kernel.org
CC: Andy Lutomirski <luto(a)amacapital.net>
CC: Andrew Morton <akpm(a)linux-foundation.org>
CC: Linus Torvalds <torvalds(a)linux-foundation.org>
---
tools/testing/selftests/rseq/rseq-arm.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-arm64.h | 9 ++++++---
tools/testing/selftests/rseq/rseq-mips.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-ppc.h | 22 +++++++++++++--------
tools/testing/selftests/rseq/rseq-s390.h | 18 +++++++++++------
tools/testing/selftests/rseq/rseq-x86.h | 19 ++++++++++++------
6 files changed, 81 insertions(+), 51 deletions(-)
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 17e8d231943a..5f262c54364f 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -30,24 +30,28 @@ do { \
#include "rseq-skip.h"
#else /* !RSEQ_SKIP_FASTPATH */
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".word " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -99,7 +103,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -166,7 +170,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -237,7 +241,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -292,7 +296,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -367,7 +371,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -443,7 +447,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -527,7 +531,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -651,7 +655,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 2079f71e0ca2..b41a2a48e965 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -82,13 +82,16 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- " .pushsection __rseq_table, \"aw\"\n" \
+ " .pushsection __rseq_cs, \"aw\"\n" \
" .balign 32\n" \
__rseq_str(label) ":\n" \
" .long " __rseq_str(version) ", " __rseq_str(flags) "\n" \
" .quad " __rseq_str(start_ip) ", " \
__rseq_str(post_commit_offset) ", " \
__rseq_str(abort_ip) "\n" \
+ " .popsection\n\t" \
+ " .pushsection __rseq_cs_ptr_array, \"aw\"\n" \
+ " .quad " __rseq_str(label) "b\n" \
" .popsection\n"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -99,8 +102,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index 25d10ff54769..fe3eabcdcbe5 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -54,26 +54,30 @@ do { \
# error unsupported _MIPS_SZLONG
#endif
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(label) "b") "\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -127,7 +131,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -192,7 +196,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -261,7 +265,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -316,7 +320,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -389,7 +393,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -463,7 +467,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -543,7 +547,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -664,7 +668,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index 24f95649d71e..9df18487fa9f 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -33,8 +33,8 @@ do { \
#else /* !RSEQ_SKIP_FASTPATH */
/*
- * The __rseq_table section can be used by debuggers to better handle
- * single-stepping through the restartable critical sections.
+ * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
+ * better handle single-stepping through the restartable critical sections.
*/
#ifdef __PPC64__
@@ -46,11 +46,14 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
@@ -67,8 +70,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -85,20 +88,23 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
/* 32-bit only supported on BE */ \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index b8b5b6f900af..fbb97815d71c 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -37,19 +37,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -61,19 +64,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index 0668608d3674..03095236f6fa 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -47,13 +47,17 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
+
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
@@ -62,8 +66,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -566,11 +570,14 @@ do { \
*/
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -581,8 +588,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
--
2.11.0
This patch set proposes KUnit, a lightweight unit testing and mocking
framework for the Linux kernel.
Unlike Autotest and kselftest, KUnit is a true unit testing framework;
it does not require installing the kernel on a test machine or in a VM
and does not require tests to be written in userspace running on a host
kernel. Additionally, KUnit is fast: From invocation to completion KUnit
can run several dozen tests in under a second. Currently, the entire
KUnit test suite for KUnit runs in under a second from the initial
invocation (build time excluded).
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
Googletest/Googlemock for C++. KUnit provides facilities for defining
unit test cases, grouping related test cases into test suites, providing
common infrastructure for running tests, mocking, spying, and much more.
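To give a flavour of the API, a minimal test case and suite might look
roughly like the sketch below (macro and struct names follow the
upstream KUnit documentation; the exact spellings in this particular
revision of the patch set may differ):
#include <kunit/test.h>
/* A single test case; each case receives a struct kunit context. */
static void example_add_test_basic(struct kunit *test)
{
	KUNIT_EXPECT_EQ(test, 3, 1 + 2);
}
/* Related test cases are grouped into a suite. */
static struct kunit_case example_test_cases[] = {
	KUNIT_CASE(example_add_test_basic),
	{}
};
static struct kunit_suite example_test_suite = {
	.name = "example",
	.test_cases = example_test_cases,
};
kunit_test_suite(example_test_suite);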
## What's so special about unit testing?
A unit test is supposed to test a single unit of code in isolation,
hence the name. There should be no dependencies outside the control of
the test; this means no external dependencies, which makes tests orders
of magnitude faster. Likewise, since there are no external dependencies,
there are no hoops to jump through to run the tests. Additionally, this
makes unit tests deterministic: a failing unit test always indicates a
problem. Finally, because unit tests necessarily have finer granularity,
they are able to test all code paths easily, solving the classic problem
of difficulty in exercising error handling code.
## Is KUnit trying to replace other testing frameworks for the kernel?
No. Most existing tests for the Linux kernel are end-to-end tests, which
have their place. A well tested system has lots of unit tests, a
reasonable number of integration tests, and some end-to-end tests. KUnit
is just trying to address the unit test space which is currently not
being addressed.
## More information on KUnit
There is a bunch of documentation near the end of this patch set that
describes how to use KUnit and best practices for writing unit tests.
For convenience I am hosting the compiled docs here:
https://google.github.io/kunit-docs/third_party/kernel/docs/
Additionally for convenience, I have applied these patches to a branch:
https://kunit.googlesource.com/linux/+/kunit/rfc/v5.1-rc2/v1
The repo may be cloned with:
git clone https://kunit.googlesource.com/linux
This patchset is on the kunit/rfc/v5.1-rc2/v1 branch.
## Changes Since Last Version
Last version was RFC v4. It seemed we were pretty much done with the RFC
phase, so I started the numbering over again. Sorry if anyone finds that
confusing.
- Reduced usage of object oriented style of member functions as
suggested by Frank.
- Did a bunch of heavy clean up of the kunit_abort stuff as suggested
by Frank and Stephen:
- Biggest change was to reduce the usage of direct calls of member
functions.
- Added a better explanation of what abort is for and further
explained the rationale for KUNIT_ASSERT_* vs. KUNIT_EXPECT_*
- Dropped BUG() usage
- Also moved try_catch interface to a new file since it seemed
obscured by being mixed in with the code that used it.
- Fixed some other minor issues pointed out by Stephen.
- Updated email address of one of the contributors.
- Dropped DT unittest port since it seemed like there was a lot more
discussion to be had: it wasn't ready to leave the RFC phase.
Instead, I added a KUnit test written by Iurii for PROC SYSCTL that
was requested by Luis some time ago.
For reference, RFC v4 can be found here:
https://lkml.org/lkml/2019/2/14/1144
--
2.21.0.392.gf8f6787159e-goog
This refactors the selftest Makefiles to extract the test running logic
to be reused between "run_tests" and "emit_tests", while also fixing
up the test output to be TAP version 13 compliant:
- added "plan" line
- fixed result line syntax
- moved all test output to be "# "-prefixed as TAP "diagnostic" lines
The prefixing code includes a fallback mode for limited execution
environments.
Additionally, the plan lines are fixed for all callers of kselftest.h.
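For reference, the TAP version 13 shape this series is aiming for looks
roughly like the following (the test names here are made up purely for
illustration):
TAP version 13
1..2
# --------------------
# running example sub-test
# --------------------
ok 1 selftests: example: test_one
not ok 2 selftests: example: test_two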
-Kees
v2:
- fix external make variable "summary=1" through-out series (shuah)
- fix plan line output for all kselftest.h users
Kees Cook (8):
selftests: Extract single-test shell logic from lib.mk
selftests: Use runner.sh for emit targets
selftests: Extract logic for multiple test runs
selftests: Add plan line and fix result line syntax
selftests: Distinguish between missing and non-executable
selftests: Move test output to diagnostic lines
selftests: Remove KSFT_TAP_LEVEL
selftests: Add test plan API to kselftest.h and adjust callers
tools/testing/selftests/.gitignore | 1 -
tools/testing/selftests/Makefile | 24 ++----
.../selftests/breakpoints/breakpoint_test.c | 15 +++-
.../breakpoints/breakpoint_test_arm64.c | 3 +-
.../breakpoints/step_after_suspend_test.c | 8 ++
.../selftests/capabilities/test_execve.c | 6 +-
.../futex/functional/futex_requeue_pi.c | 1 +
.../futex_requeue_pi_mismatched_ops.c | 1 +
.../futex_requeue_pi_signal_restart.c | 1 +
.../futex_wait_private_mapped_file.c | 1 +
.../futex/functional/futex_wait_timeout.c | 1 +
.../futex_wait_uninitialized_heap.c | 1 +
.../futex/functional/futex_wait_wouldblock.c | 1 +
tools/testing/selftests/kselftest.h | 17 +++-
tools/testing/selftests/kselftest/prefix.pl | 23 +++++
tools/testing/selftests/kselftest/runner.sh | 86 +++++++++++++++++++
tools/testing/selftests/lib.mk | 64 +++-----------
.../selftests/membarrier/membarrier_test.c | 1 +
tools/testing/selftests/pidfd/pidfd_test.c | 1 +
tools/testing/selftests/sigaltstack/sas.c | 1 +
tools/testing/selftests/sync/sync_test.c | 1 +
21 files changed, 178 insertions(+), 80 deletions(-)
create mode 100755 tools/testing/selftests/kselftest/prefix.pl
create mode 100644 tools/testing/selftests/kselftest/runner.sh
--
2.17.1
This is just sending out the tweaked fix from Tycho and the selftest
changes needed to support it. I intend to send this to Linus directly
after it's been in -next for a few days for v5.1 fixes.
Thanks!
-Kees
Kees Cook (1):
selftests/seccomp: Prepare for exclusive seccomp flags
Tycho Andersen (1):
seccomp: Make NEW_LISTENER and TSYNC flags exclusive
kernel/seccomp.c | 17 ++++++++--
tools/testing/selftests/seccomp/seccomp_bpf.c | 34 ++++++++++++++-----
2 files changed, 40 insertions(+), 11 deletions(-)
--
2.17.1
Use udf as the guard instruction for the restartable sequence abort
handler.
Previously, the chosen signature was not a valid instruction, based
on the assumption that it could always sit in a literal pool. However,
there are compilation environments in which literal pools are not
availble, for instance execute-only code. Therefore, we need to
choose a signature value that is also a valid instruction.
Handle compiling with -mbig-endian on ARMv6+, which generates binaries
with mixed code vs data endianness (little endian code, big endian
data).
Otherwise, a mismatch between code endianness for the generated
signatures and data endianness for the RSEQ_SIG parameter passed to rseq
registration will trigger application segmentation faults when the
kernel tries to abort rseq critical sections.
Prior to ARMv6, -mbig-endian generates big-endian code and data, so
endianness should not be reversed in that case.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
CC: Peter Zijlstra <peterz(a)infradead.org>
CC: Thomas Gleixner <tglx(a)linutronix.de>
CC: Joel Fernandes <joelaf(a)google.com>
CC: Catalin Marinas <catalin.marinas(a)arm.com>
CC: Dave Watson <davejwatson(a)fb.com>
CC: Will Deacon <will.deacon(a)arm.com>
CC: Shuah Khan <shuah(a)kernel.org>
CC: Andi Kleen <andi(a)firstfloor.org>
CC: linux-kselftest(a)vger.kernel.org
CC: "H . Peter Anvin" <hpa(a)zytor.com>
CC: Chris Lameter <cl(a)linux.com>
CC: Russell King <linux(a)arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages(a)gmail.com>
CC: "Paul E . McKenney" <paulmck(a)linux.vnet.ibm.com>
CC: Paul Turner <pjt(a)google.com>
CC: Boqun Feng <boqun.feng(a)gmail.com>
CC: Josh Triplett <josh(a)joshtriplett.org>
CC: Steven Rostedt <rostedt(a)goodmis.org>
CC: Ben Maurer <bmaurer(a)fb.com>
CC: linux-api(a)vger.kernel.org
CC: Andy Lutomirski <luto(a)amacapital.net>
CC: Andrew Morton <akpm(a)linux-foundation.org>
CC: Linus Torvalds <torvalds(a)linux-foundation.org>
---
tools/testing/selftests/rseq/rseq-arm.h | 52 +++++++++++++++++++++++++++++++--
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 5f262c54364f..e8ccfc37d685 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -5,7 +5,54 @@
* (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
*/
-#define RSEQ_SIG 0x53053053
+/*
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5de3. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern in the A32 instruction set is:
+ *
+ * e7f5def3 udf #24035 ; 0x5de3
+ *
+ * This translates to the following instruction pattern in the T16 instruction
+ * set:
+ *
+ * little endian:
+ * def3 udf #243 ; 0xf3
+ * e7f5 b.n <7f5>
+ *
+ * pre-ARMv6 big endian code:
+ * e7f5 b.n <7f5>
+ * def3 udf #243 ; 0xf3
+ *
+ * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
+ * code and big-endian data. Ensure the RSEQ_SIG data signature matches code
+ * endianness. Prior to ARMv6, -mbig-endian generates big-endian code and data
+ * (which match), so there is no need to reverse the endianness of the data
+ * representation of the signature. However, the choice between BE32 and BE8
+ * is done by the linker, so we cannot know whether code and data endianness
+ * will be mixed before the linker is invoked.
+ */
+
+#define RSEQ_SIG_CODE 0xe7f5def3
+
+#ifndef __ASSEMBLER__
+
+#define RSEQ_SIG_DATA \
+ ({ \
+ int sig; \
+ asm volatile ( "b 2f\n\t" \
+ "1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
+ "2:\n\t" \
+ "ldr %[sig], 1b\n\t" \
+ : [sig] "=r" (sig)); \
+ sig; \
+ })
+
+#define RSEQ_SIG RSEQ_SIG_DATA
+
+#endif
#define rseq_smp_mb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
#define rseq_smp_rmb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
@@ -78,7 +125,8 @@ do { \
__rseq_str(table_label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
- ".word " __rseq_str(RSEQ_SIG) "\n\t" \
+ ".arm\n\t" \
+ ".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
__rseq_str(label) ":\n\t" \
teardown \
"b %l[" __rseq_str(abort_label) "]\n\t"
--
2.11.0
The entries within __rseq_table are aligned on 32 bytes due to
linux/rseq.h struct rseq_cs uapi requirements, but the start of the
__rseq_table section is not guaranteed to be 32-byte aligned. It can
cause padding to be added at the start of the section, which makes it
hard to use as an array of items by debuggers.
Considering that __rseq_table does not really consist of a table due to
the presence of padding, rename this section to __rseq_cs.
Create a new __rseq_cs_ptr_array section which contains 64-bit packed
pointers to entries within the __rseq_cs section.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
CC: Thomas Gleixner <tglx(a)linutronix.de>
CC: Joel Fernandes <joelaf(a)google.com>
CC: Peter Zijlstra <peterz(a)infradead.org>
CC: Catalin Marinas <catalin.marinas(a)arm.com>
CC: Dave Watson <davejwatson(a)fb.com>
CC: Will Deacon <will.deacon(a)arm.com>
CC: Shuah Khan <shuah(a)kernel.org>
CC: Andi Kleen <andi(a)firstfloor.org>
CC: linux-kselftest(a)vger.kernel.org
CC: "H . Peter Anvin" <hpa(a)zytor.com>
CC: Chris Lameter <cl(a)linux.com>
CC: Russell King <linux(a)arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages(a)gmail.com>
CC: "Paul E . McKenney" <paulmck(a)linux.vnet.ibm.com>
CC: Paul Turner <pjt(a)google.com>
CC: Boqun Feng <boqun.feng(a)gmail.com>
CC: Josh Triplett <josh(a)joshtriplett.org>
CC: Steven Rostedt <rostedt(a)goodmis.org>
CC: Ben Maurer <bmaurer(a)fb.com>
CC: linux-api(a)vger.kernel.org
CC: Andy Lutomirski <luto(a)amacapital.net>
CC: Andrew Morton <akpm(a)linux-foundation.org>
CC: Linus Torvalds <torvalds(a)linux-foundation.org>
---
tools/testing/selftests/rseq/rseq-arm.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-arm64.h | 9 ++++++---
tools/testing/selftests/rseq/rseq-mips.h | 32 +++++++++++++++++--------------
tools/testing/selftests/rseq/rseq-ppc.h | 22 +++++++++++++--------
tools/testing/selftests/rseq/rseq-s390.h | 18 +++++++++++------
tools/testing/selftests/rseq/rseq-x86.h | 19 ++++++++++++------
6 files changed, 81 insertions(+), 51 deletions(-)
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 17e8d231943a..5f262c54364f 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -30,24 +30,28 @@ do { \
#include "rseq-skip.h"
#else /* !RSEQ_SKIP_FASTPATH */
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".word " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -99,7 +103,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -166,7 +170,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -237,7 +241,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -292,7 +296,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -367,7 +371,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -443,7 +447,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -527,7 +531,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -651,7 +655,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 2079f71e0ca2..b41a2a48e965 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -82,13 +82,16 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- " .pushsection __rseq_table, \"aw\"\n" \
+ " .pushsection __rseq_cs, \"aw\"\n" \
" .balign 32\n" \
__rseq_str(label) ":\n" \
" .long " __rseq_str(version) ", " __rseq_str(flags) "\n" \
" .quad " __rseq_str(start_ip) ", " \
__rseq_str(post_commit_offset) ", " \
__rseq_str(abort_ip) "\n" \
+ " .popsection\n\t" \
+ " .pushsection __rseq_cs_ptr_array, \"aw\"\n" \
+ " .quad " __rseq_str(label) "b\n" \
" .popsection\n"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -99,8 +102,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index 25d10ff54769..fe3eabcdcbe5 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -54,26 +54,30 @@ do { \
# error unsupported _MIPS_SZLONG
#endif
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(label) "b") "\n\t" \
".popsection\n\t"
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
- __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -127,7 +131,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -192,7 +196,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -261,7 +265,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
#endif
@@ -316,7 +320,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -389,7 +393,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -463,7 +467,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -543,7 +547,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
@@ -664,7 +668,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
rseq_workaround_gcc_asm_size_guess();
__asm__ __volatile__ goto (
- RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
#ifdef RSEQ_COMPARE_TWICE
RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index 24f95649d71e..9df18487fa9f 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -33,8 +33,8 @@ do { \
#else /* !RSEQ_SKIP_FASTPATH */
/*
- * The __rseq_table section can be used by debuggers to better handle
- * single-stepping through the restartable critical sections.
+ * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
+ * better handle single-stepping through the restartable critical sections.
*/
#ifdef __PPC64__
@@ -46,11 +46,14 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
@@ -67,8 +70,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -85,20 +88,23 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
/* 32-bit only supported on BE */ \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index b8b5b6f900af..fbb97815d71c 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -37,19 +37,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -61,19 +64,22 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
".popsection\n\t"
/*
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index 0668608d3674..03095236f6fa 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -47,13 +47,17 @@ do { \
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
".popsection\n\t"
+
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
@@ -62,8 +66,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
@@ -566,11 +570,14 @@ do { \
*/
#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_table, \"aw\"\n\t" \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
".balign 32\n\t" \
__rseq_str(label) ":\n\t" \
".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long " __rseq_str(label) "b, 0x0\n\t" \
".popsection\n\t"
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
@@ -581,8 +588,8 @@ do { \
* Exit points of a rseq critical section consist of all instructions outside
* of the critical section where a critical section can either branch to or
* reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_table section and should not
- * be explicitly defined as additional exit points. Knowing all exit points is
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
* useful to assist debuggers stepping over the critical section.
*/
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
--
2.11.0
pidfds are /proc/pid directory file descriptors referring to a task group
leader. The Android low memory killer (LMK) needs pidfd polling support
to replace code that currently checks for the existence of /proc/pid to
know that a process it has signalled to be killed has died; that approach
is both racy and slow. The pidfd poll approach is race-free, and also
allows the LMK to do other things (such as polling on other fds) while
waiting for the process being killed to die.
It also prevents the situation where a PID is reused between the time LMK
sends a kill signal and the time it checks for the existence of the PID,
in which case the wrong PID could end up being checked.
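As a rough sketch of the intended userspace usage (illustrative only, not
part of this patch; the path handling, event mask, and error handling are
simplified), an LMK-style poller could look like:

  #include <fcntl.h>
  #include <poll.h>
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  /*
   * Illustrative only: open /proc/<pid> as a "pidfd", send SIGKILL, then
   * poll until the task group leader has died (POLLIN becomes set).
   */
  int main(int argc, char **argv)
  {
          struct pollfd pfd;
          char path[64];
          int pid;

          if (argc != 2)
                  return 1;
          pid = atoi(argv[1]);

          snprintf(path, sizeof(path), "/proc/%d", pid);
          pfd.fd = open(path, O_RDONLY);
          if (pfd.fd < 0)
                  return 1;
          pfd.events = POLLIN;

          kill(pid, SIGKILL);

          /* Blocks here until the polled process has exited. */
          if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
                  printf("pid %d has exited\n", pid);

          close(pfd.fd);
          return 0;
  }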
In this patch, we hook into the same mechanism used when the parent of
the task group is to be notified; that is also the point at which the
tasks waiting on a poll of the pidfd are awakened.
We have decided to include the waitqueue in struct pid for the following
reasons:
1. The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be an option here because the task can be
reaped and destroyed before the poll returns.
2. Including the waitqueue in struct pid means that, during de_thread()
at exec time, the thread doing the exec automatically gets the new
waitqueue/pid even though its task_struct is different.
Appropriate test cases are added in the second patch to provide coverage
of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference;
however, this patch tries to properly handle the different situations
related to thread group existence and how/where it notifies, and it also
fixes other bugs (such as task_struct existence). Daniel had a similar
patch [2] recently which this patch supersedes.
[1] https://lore.kernel.org/patchwork/patch/345098/
[2] https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
Cc: luto(a)amacapital.net
Cc: rostedt(a)goodmis.org
Cc: dancol(a)google.com
Cc: christian(a)brauner.io
Cc: jannh(a)google.com
Cc: surenb(a)google.com
Cc: torvalds(a)linux-foundation.org
Co-developed-by: Daniel Colascione <dancol(a)google.com>
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
fs/proc/base.c | 39 +++++++++++++++++++++++++++++++++++++++
include/linux/pid.h | 3 +++
kernel/exit.c | 1 -
kernel/pid.c | 2 ++
kernel/signal.c | 14 ++++++++++++++
5 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..879900082647 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts)
+{
+ int poll_flags = 0;
+ struct task_struct *task;
+ struct pid *pid;
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+
+ WARN_ON_ONCE(task && !thread_group_leader(task));
+
+ /*
+ * tasklist_lock must be held to avoid racing with changes in
+ * exit_state and the wake up. Basically to avoid:
+ *
+ * P0: read exit_state = 0
+ * P1: write exit_state = EXIT_DEAD
+ * P1: Do a wake up - wq is empty, so do nothing
+ * P0: Queue for polling - wait forever.
+ */
+ read_lock(&tasklist_lock);
+ if (!task)
+ poll_flags = POLLIN | POLLRDNORM | POLLERR;
+ else if (task->exit_state == EXIT_DEAD)
+ poll_flags = POLLIN | POLLRDNORM;
+ else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
+ poll_flags = POLLIN | POLLRDNORM;
+
+ if (!poll_flags) {
+ pid = proc_pid(file->f_path.dentry->d_inode);
+ poll_wait(file, &pid->wait_pidfd, pts);
+ }
+ read_unlock(&tasklist_lock);
+
+ if (task)
+ put_task_struct(task);
+ return poll_flags;
+}
+
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
+ .poll = proc_tgid_base_poll,
.iterate_shared = proc_tgid_base_readdir,
.llseek = generic_file_llseek,
};
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b6f4ba16065a..2e0dcbc6d14e 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/wait.h>
enum pid_type
{
@@ -60,6 +61,8 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
+ /* wait queue for pidfd pollers */
+ wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
};
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..c386ec52687d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
}
-
void release_task(struct task_struct *p)
{
struct task_struct *leader;
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..5c90c239242f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c
index f98448cf2def..e3781703ef7e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret;
}
+static void do_wakeup_pidfd_pollers(struct task_struct *task)
+{
+ struct pid *pid;
+
+ lockdep_assert_held(&tasklist_lock);
+
+ pid = get_task_pid(task, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+ put_pid(pid);
+}
+
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */
+ do_wakeup_pidfd_pollers(tsk);
+
if (sig != SIGCHLD) {
/*
* This is only possible if parent == real_parent.
--
2.21.0.392.gf8f6787159e-goog
This refactors the selftest Makefiles to extract the test-running logic
so it can be reused between "run_tests" and "emit_tests", while also
fixing up the test output to be TAP version 13 compliant:
- added a "plan" line
- fixed the result line syntax
- moved all test output to "# "-prefixed TAP "diagnostic" lines
The prefixing code includes a fallback mode for limited execution
environments.
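For illustration only (the test names, counts, and diagnostic lines below
are made up rather than taken from an actual run), output in this form
looks roughly like:

  TAP version 13
  1..2
  # selftests: net: run_afpackettests
  # --------------------
  # running psock_fanout test
  # --------------------
  # [PASS]
  ok 1 selftests: net: run_afpackettests
  not ok 2 selftests: net: run_netsocktests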
-Kees
Kees Cook (6):
selftests: Extract single-test shell logic from lib.mk
selftests: Use runner.sh for emit targets
selftests: Extract logic for multiple test runs
selftests/runner: Add plan line and fix result line syntax
selftests/runner: Distinguish between missing and non-executable
selftests: Move test output to diagnostic lines
tools/testing/selftests/.gitignore | 1 -
tools/testing/selftests/Makefile | 18 +++--
tools/testing/selftests/kselftest/prefix.pl | 23 ++++++
tools/testing/selftests/kselftest/runner.sh | 80 +++++++++++++++++++++
tools/testing/selftests/lib.mk | 61 +++-------------
5 files changed, 119 insertions(+), 64 deletions(-)
create mode 100755 tools/testing/selftests/kselftest/prefix.pl
create mode 100644 tools/testing/selftests/kselftest/runner.sh
--
2.17.1