For some services we are using an "established-over-unconnected" model.
'''
// create unconnected socket and 'listen()'
srv_fd = socket(AF_INET, SOCK_DGRAM)
setsockopt(srv_fd, SO_REUSEPORT)
bind(srv_fd, SERVER_ADDR, SERVER_PORT)
// 'accept()'
data, client_addr = recvmsg(srv_fd)
// create a connected socket for this request
cli_fd = socket(AF_INET, SOCK_DGRAM)
setsockopt(cli_fd, SO_REUSEPORT)
bind(cli_fd, SERVER_ADDR, SERVER_PORT)
connect(cli_fd, client_addr)
...
// do handshake with cli_fd
'''
This programming pattern simulates accept() using UDP, creating a new
socket for each client request. The server can then use separate sockets
to handle client requests, avoiding the need to use a single UDP socket
for I/O transmission.
But there is a race condition between the bind() and connect() of the
connected socket:
We might receive unexpected packets belonging to the unconnected socket
before connect() is executed, which is not what we need.
(Of course, before connect(), the unconnected socket will also receive
packets from the connected socket, which is easily resolved because
upper-layer protocols typically require explicit boundaries, and we
receive a complete packet before creating a connected socket.)
Before this patch, the connected socket had to filter requests at recvmsg
time, acting as a dispatcher to some extent. With this patch, we can
consider the bind and connect operations to be atomic.
Signed-off-by: Jiayuan Chen <jiayuan.chen(a)linux.dev>
---
include/linux/udp.h | 1 +
include/uapi/linux/udp.h | 1 +
net/ipv4/udp.c | 13 ++++++++++---
net/ipv6/udp.c | 5 +++--
4 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 895240177f4f..8d281a0c0d9d 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -42,6 +42,7 @@ enum {
UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */
UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
+ UDP_FLAGS_STOP_RCV, /* Stop receiving packets */
};
struct udp_sock {
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index edca3e430305..bb8e0a749a55 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -34,6 +34,7 @@ struct udphdr {
#define UDP_NO_CHECK6_RX 102 /* Disable accepting checksum for UDP6 */
#define UDP_SEGMENT 103 /* Set GSO segmentation size */
#define UDP_GRO 104 /* This socket can receive UDP GRO packets */
+#define UDP_STOP_RCV 105 /* This socket will not receive any packets */
/* UDP encapsulation types */
#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* unused draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f9f5b92cf4b6..764d337ab1b3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -376,7 +376,8 @@ static int compute_score(struct sock *sk, const struct net *net,
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
- ipv6_only_sock(sk))
+ ipv6_only_sock(sk) ||
+ udp_test_bit(STOP_RCV, sk))
return -1;
if (sk->sk_rcv_saddr != daddr)
@@ -494,7 +495,7 @@ static struct sock *udp4_lib_lookup2(const struct net *net,
result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, udp_ehashfn);
- if (!result) {
+ if (!result || udp_test_bit(STOP_RCV, result)) {
result = sk;
continue;
}
@@ -3031,7 +3032,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
sockopt_release_sock(sk);
break;
-
+ case UDP_STOP_RCV:
+ udp_assign_bit(STOP_RCV, sk, valbool);
+ break;
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -3120,6 +3123,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = udp_test_bit(GRO_ENABLED, sk);
break;
+ case UDP_STOP_RCV:
+ val = udp_test_bit(STOP_RCV, sk);
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7317f8e053f1..55896a78e94b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -137,7 +137,8 @@ static int compute_score(struct sock *sk, const struct net *net,
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
- sk->sk_family != PF_INET6)
+ sk->sk_family != PF_INET6 ||
+ udp_test_bit(STOP_RCV, sk))
return -1;
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
@@ -245,7 +246,7 @@ static struct sock *udp6_lib_lookup2(const struct net *net,
result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, udp6_ehashfn);
- if (!result) {
+ if (!result || udp_test_bit(STOP_RCV, result)) {
result = sk;
continue;
}
--
2.47.1
The vIOMMU object is designed to represent a slice of an IOMMU HW for its
virtualization features shared with or passed to user space (a VM mostly)
in a way of HW acceleration. This extends the HWPT-based design for more
advanced virtualization features.
A vCMDQ introduced by this series as a part of the vIOMMU infrastructure
represents a HW supported queue/buffer for VM to use exclusively, e.g.
- NVIDIA's virtual command queue
- AMD vIOMMU's command buffer
either of which is an IOMMU HW feature to directly load and execute cache
invalidation commands issued by a guest kernel, to shoot down TLB entries
that HW cached for guest-owned stage-1 page table entries. This is a big
improvement since there is no VM Exit during an invalidation, compared to
the traditional invalidation pathway by trapping a guest-owned invalidation
queue and forwarding those commands/requests to the host kernel that will
eventually fill a HW-owned queue to execute those commands.
Thus, a vCMDQ object, as an initial use case, is all about a guest-owned
HW command queue that VMM can allocate/configure depending on the request
from a guest kernel. Introduce a new IOMMUFD_OBJ_VCMDQ and its allocator
IOMMUFD_CMD_VCMDQ_ALLOC allowing VMM to forward the IOMMU-specific queue
info, such as queue base address, size, etc.
Meanwhile, a guest-owned command queue needs the kernel (a command queue
driver) to control the queue by reading/writing its consumer and producer
indexes, which means the command queue HW allows the guest kernel to get
a direct R/W access to those registers. Introduce an mmap infrastructure
to the iommufd core so as to support pass through a piece of MMIO region
from the host physical address space to the guest physical address space.
The VMA info (vm_pgoff/size) used by an mmap must be pre-allocated during
the IOMMUFD_CMD_VCMDQ_ALLOC and returned to the user space as output
driver data by the IOMMUFD_CMD_VCMDQ_ALLOC. So, this requires
driver-specific user data support by a vIOMMU object.
As a real-world use case, this series implements vCMDQ support in the
tegra241-cmdqv driver for the vCMDQ on NVIDIA Grace CPU. In other words,
this is also the Tegra CMDQV series Part-2 (user-space support), reworked
from Previous RFCv1:
https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/
This enables the HW accelerated feature for NVIDIA Grace CPU. Compared to
the standard SMMUv3 operating in the nested translation mode trapping CMDQ
for TLBI and ATC_INV commands, this gives a huge performance improvement:
70% to 90% reductions of invalidation time were measured by various DMA
unmap tests running in a guest OS.
This is on Github:
https://github.com/nicolinc/iommufd/commits/iommufd_vcmdq-v2
Pairing QEMU branch for testing:
https://github.com/nicolinc/qemu/commits/wip/for_iommufd_vcmdq-v2
Changelog
v2
* Add Reviewed-by from Jason
* [smmu] Fix vsmmu initial value
* [smmu] Support impl for hw_info
* [tegra] Rename "slot" to "vsid"
* [tegra] Update kdocs and commit logs
* [tegra] Map/unmap LVCMDQ dynamically
* [tegra] Refcount the previous LVCMDQ
* [tegra] Return -EEXIST if LVCMDQ exists
* [tegra] Simplify VINTF cleanup routine
* [tegra] Use vmid and s2_domain in vsmmu
* [tegra] Rename "mmap_pgoff" to "immap_id"
* [tegra] Add more addr and length validation
* [iommufd] Add more narrative to mmap's kdoc
* [iommufd] Add iommufd_struct_depend/undepend()
* [iommufd] Rename vcmdq_free op to vcmdq_destroy
* [iommufd] Fix bug in iommu_copy_struct_to_user()
* [iommufd] Drop is_io from iommufd_ctx_alloc_mmap()
* [iommufd] Test the queue memory for its contiguity
* [iommufd] Return -ENXIO if address or length fails
* [iommufd] Do not change @min_last in mock_viommu_alloc()
* [iommufd] Generalize TEGRA241_VCMDQ data in core structure
* [iommufd] Add selftest coverage for IOMMUFD_CMD_VCMDQ_ALLOC
* [iommufd] Add iopt_pin_pages() to prevent queue memory from unmapping
v1
https://lore.kernel.org/all/cover.1744353300.git.nicolinc@nvidia.com/
Thanks
Nicolin
Nicolin Chen (22):
iommufd/viommu: Add driver-allocated vDEVICE support
iommu: Pass in a driver-level user data structure to viommu_alloc op
iommufd/viommu: Allow driver-specific user data for a vIOMMU object
iommu: Add iommu_copy_struct_to_user helper
iommufd: Add iommufd_struct_destroy to revert iommufd_viommu_alloc
iommufd/selftest: Support user_data in mock_viommu_alloc
iommufd/selftest: Add coverage for viommu data
iommufd: Abstract iopt_pin_pages and iopt_unpin_pages helpers
iommufd/viommu: Introduce IOMMUFD_OBJ_VCMDQ and its related struct
iommufd/viommu: Add IOMMUFD_CMD_VCMDQ_ALLOC ioctl
iommufd: Add for-driver helpers iommufd_vcmdq_depend/undepend()
iommufd/selftest: Add coverage for IOMMUFD_CMD_VCMDQ_ALLOC
iommufd: Add mmap interface
iommufd/selftest: Add coverage for the new mmap interface
Documentation: userspace-api: iommufd: Update vCMDQ
iommu/arm-smmu-v3-iommufd: Add vsmmu_alloc impl op
iommu/arm-smmu-v3-iommufd: Support implementation-defined hw_info
iommu/tegra241-cmdqv: Use request_threaded_irq
iommu/tegra241-cmdqv: Simplify deinit flow in
tegra241_cmdqv_remove_vintf()
iommu/tegra241-cmdqv: Do not statically map LVCMDQs
iommu/tegra241-cmdqv: Add user-space use support
iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 25 +-
drivers/iommu/iommufd/io_pagetable.h | 8 +
drivers/iommu/iommufd/iommufd_private.h | 25 +-
drivers/iommu/iommufd/iommufd_test.h | 20 +
include/linux/iommu.h | 43 +-
include/linux/iommufd.h | 146 ++++++
include/uapi/linux/iommufd.h | 113 ++++-
tools/testing/selftests/iommu/iommufd_utils.h | 51 +-
.../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 42 +-
.../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 451 +++++++++++++++++-
drivers/iommu/iommufd/device.c | 117 +----
drivers/iommu/iommufd/driver.c | 81 ++++
drivers/iommu/iommufd/io_pagetable.c | 95 ++++
drivers/iommu/iommufd/main.c | 58 ++-
drivers/iommu/iommufd/selftest.c | 123 ++++-
drivers/iommu/iommufd/viommu.c | 111 ++++-
tools/testing/selftests/iommu/iommufd.c | 93 +++-
.../selftests/iommu/iommufd_fail_nth.c | 11 +-
Documentation/userspace-api/iommufd.rst | 14 +
19 files changed, 1436 insertions(+), 191 deletions(-)
--
2.43.0
kunit kernel build could fail if there are any build artifacts from a
prior kernel build. These can be hard to debug if the build artifact
happens to be a generated header file. It took me a while to debug a kunit
build failure on ARCH=x86_64 in a tree which had a generated header file
arch/x86/realmode/rm/pasyms.h
make ARCH=um mrproper will not clean the tree. It is necessary to run
make ARCH=x86_64 mrproper
Example work-flow that could lead to this:
make allmodconfig (x86_64)
make
./tools/testing/kunit/kunit.py run
Add this to the documentation and kunit.py build help message.
Shuah Khan (2):
doc: kunit: add information about cleaning source trees
kunit: add tips to clean source tree to build help message
Documentation/dev-tools/kunit/start.rst | 12 ++++++++++++
tools/testing/kunit/kunit.py | 2 +-
2 files changed, 13 insertions(+), 1 deletion(-)
--
2.47.2