Every devmem dmabuf binding hands the page_pool PAGE_SIZE niovs today. On NICs that consume one descriptor per netmem, this caps a single RX descriptor at PAGE_SIZE and burns CPU on buffer churn.
In this series, we add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that lets userspace request a larger niov size (power of two >= PAGE_SIZE). Drivers must opt in via queue_mgmt_ops.QCFG_RX_PAGE_SIZE.
Selftests use udmabuf, but udmabuf sg table entries were previously hardcoded to PAGE_SIZE. This series modifies udmabuf to respect folio sizes in its exported sgtable. As a result, when a udmabuf is backed by MFD_HUGETLB 2MB pages, the sgtable is populated with 2MB entries, allowing devmem's gen_pool to carve out large (e.g. 64K) niovs.
Measurements ------------
Setup: kperf devmem RX/TX cuda, 4 flows, 64 MB messages, 60s, dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov size, mlx5.
niov   RX dev Gbps        RX flow avg Gbps   app sys %
-----  ----------------   -----------------  ----------------
4K     300.63 +/- 53.21   75.16 +/- 13.30    54.15 +/- 10.23
16K    321.35 +/- 28.20   80.34 +/- 7.05     41.05 +/- 8.87
32K    347.63 +/- 2.20    86.91 +/- 0.55     44.54 +/- 3.51
64K    332.11 +/- 14.26   83.03 +/- 3.56     35.47 +/- 3.11
RX app sys % drops ~19 percentage points (54.15 -> 35.47, roughly a 35% relative reduction) from 4K to 64K.
kperf support (not yet merged): https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec...
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com --- Bobby Eshleman (4): net: devmem: allow rx-buf-size > PAGE_SIZE per dmabuf binding udmabuf: emit one sg entry per pinned folio selftests/net: ncdevmem: add -b option to set rx-buf-size on bind selftests/net: devmem.py: add check_rx_large_niov
Documentation/netlink/specs/netdev.yaml | 8 ++++ drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++--- include/uapi/linux/netdev.h | 1 + net/core/devmem.c | 52 +++++++++++++--------- net/core/devmem.h | 13 ++++-- net/core/netdev-genl-gen.c | 5 ++- net/core/netdev-genl.c | 18 +++++++- tools/include/uapi/linux/netdev.h | 1 + tools/testing/selftests/drivers/net/hw/config | 1 + tools/testing/selftests/drivers/net/hw/devmem.py | 12 ++++- .../testing/selftests/drivers/net/hw/devmem_lib.py | 46 ++++++++++++++++++- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 49 ++++++++++++++++++-- .../testing/selftests/drivers/net/hw/nk_devmem.py | 11 ++++- 13 files changed, 220 insertions(+), 44 deletions(-) --- base-commit: dfcc2ff12925d99e858eaf539eaa4aaaf81fe2a6 change-id: 20260602-tcpdm-large-niovs-56523a3a1077
Best regards,
From: Bobby Eshleman bobbyeshleman@meta.com
Every devmem dmabuf binding today hands the page_pool PAGE_SIZE niovs. This caps a single RX descriptor at PAGE_SIZE, burning CPU on buffer churn for large flows.
Add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that lets userspace request a larger niov size. The value must be a power of two >= PAGE_SIZE.
Measurements ------------ Setup: kperf in devmem RX/TX cuda mode, 4 flows, 64 MB messages, 60s, dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov size, mlx5.
CPU Util:
niov   net sirq %         net idle %         app sys %          app idle %
-----  ----------------   ----------------   ----------------   ----------------
4K     62.38 +/- 8.27     33.40 +/- 7.51     54.15 +/- 10.23    43.67 +/- 10.53
16K    58.91 +/- 5.35     35.23 +/- 5.88     41.05 +/- 8.87     56.42 +/- 9.24
32K    64.12 +/- 0.68     31.09 +/- 1.48     44.54 +/- 3.51     52.63 +/- 3.65
64K    54.69 +/- 5.54     39.67 +/- 5.81     35.47 +/- 3.11     61.97 +/- 3.27
RX app sys % drops ~19 percentage points (54.15 -> 35.47, roughly a 35% relative reduction) from 4K to 64K.
Throughput:
niov   RX dev Gbps        RX flow avg Gbps
-----  ----------------   -----------------
4K     300.63 +/- 53.21   75.16 +/- 13.30
16K    321.35 +/- 28.20   80.34 +/- 7.05
32K    347.63 +/- 2.20    86.91 +/- 0.55
64K    332.11 +/- 14.26   83.03 +/- 3.56
Throughput seems to increase, but the stdev is pretty wide, so this could just be noise.
kperf support (not yet merged): https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec...
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com --- Documentation/netlink/specs/netdev.yaml | 8 +++++ include/uapi/linux/netdev.h | 1 + net/core/devmem.c | 52 +++++++++++++++++++-------------- net/core/devmem.h | 13 ++++++--- net/core/netdev-genl-gen.c | 5 ++-- net/core/netdev-genl.c | 18 ++++++++++-- tools/include/uapi/linux/netdev.h | 1 + 7 files changed, 68 insertions(+), 30 deletions(-)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index a1f4c5a561e9..063119907983 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -591,6 +591,13 @@ attribute-sets: type: u32 checks: min: 1 + - + name: rx-buf-size + doc: | + Size in bytes of each RX buffer the NIC writes into from the bound + dmabuf. Must be a power of two and >= PAGE_SIZE; defaults to + PAGE_SIZE. + type: u32
operations: list: @@ -805,6 +812,7 @@ operations: - ifindex - fd - queues + - rx-buf-size reply: attributes: - id diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..180a4ffffd60 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -217,6 +217,7 @@ enum { NETDEV_A_DMABUF_QUEUES, NETDEV_A_DMABUF_FD, NETDEV_A_DMABUF_ID, + NETDEV_A_DMABUF_RX_BUF_SIZE,
__NETDEV_A_DMABUF_MAX, NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) diff --git a/net/core/devmem.c b/net/core/devmem.c index 957d6b96216b..5a1c0d7984a8 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -46,7 +46,7 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + - ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); + ((dma_addr_t)net_iov_idx(niov) << owner->binding->niov_shift); }
static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) @@ -93,13 +93,14 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) ssize_t offset; ssize_t index;
- dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, + 1UL << binding->niov_shift, (void **)&owner); if (!dma_addr) return NULL;
offset = dma_addr - owner->base_dma_addr; - index = offset / PAGE_SIZE; + index = offset >> binding->niov_shift; niov = &owner->area.niovs[index];
niov->desc.pp_magic = 0; @@ -113,12 +114,13 @@ void net_devmem_free_dmabuf(struct net_iov *niov) { struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov); + size_t niov_size = 1UL << binding->niov_shift;
if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, - PAGE_SIZE))) + niov_size))) return;
- gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); + gen_pool_free(binding->chunk_pool, dma_addr, niov_size); }
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -163,6 +165,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, u32 xa_idx; int err;
+ if (binding->niov_shift != PAGE_SHIFT) + mp_params.rx_page_size = 1U << binding->niov_shift; + err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack); if (err) return err; @@ -184,14 +189,16 @@ struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, struct device *dma_dev, enum dma_data_direction direction, - unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + unsigned int dmabuf_fd, unsigned int niov_shift, + struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; + size_t niov_size = 1UL << niov_shift; static u32 id_alloc_next; + unsigned int sg_idx, i; struct scatterlist *sg; struct dma_buf *dmabuf; - unsigned int sg_idx, i; unsigned long virtual; int err;
@@ -213,6 +220,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
binding->dev = dev; binding->vdev = vdev; + binding->niov_shift = niov_shift; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
err = percpu_ref_init(&binding->ref, @@ -248,18 +256,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, goto err_unmap; } binding->tx_vec = kvmalloc_objs(struct net_iov *, - dmabuf->size / PAGE_SIZE); + dmabuf->size >> niov_shift); if (!binding->tx_vec) { err = -ENOMEM; goto err_unmap; } }
- /* For simplicity we expect to make PAGE_SIZE allocations, but the - * binding can be much more flexible than that. We may be able to - * allocate MTU sized chunks here. Leave that for future work... - */ - binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + binding->chunk_pool = gen_pool_create(niov_shift, dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; @@ -273,9 +277,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, size_t len = sg_dma_len(sg); struct net_iov *niov;
- if (!IS_ALIGNED(len, PAGE_SIZE)) { + if (!IS_ALIGNED(dma_addr, niov_size) || + !IS_ALIGNED(len, niov_size)) { err = -EINVAL; - NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned"); + NL_SET_ERR_MSG(extack, + "dmabuf sg entry not aligned to niov size"); goto err_free_chunks; }
@@ -288,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->area.num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len >> niov_shift; owner->binding = binding;
err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -313,7 +319,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE) - binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; + binding->tx_vec[(owner->area.base_virtual >> niov_shift) + i] = niov; }
virtual += len; @@ -430,13 +436,15 @@ struct net_iov * net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t virt_addr, size_t *off, size_t *size) { + size_t niov_size = 1UL << binding->niov_shift; + if (virt_addr >= binding->dmabuf->size) return NULL;
- *off = virt_addr % PAGE_SIZE; - *size = PAGE_SIZE - *off; + *off = virt_addr & (niov_size - 1); + *size = niov_size - *off;
- return binding->tx_vec[virt_addr / PAGE_SIZE]; + return binding->tx_vec[virt_addr >> binding->niov_shift]; }
/*** "Dmabuf devmem memory provider" ***/ @@ -454,8 +462,8 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) pool->dma_sync = false; pool->dma_sync_for_cpu = false;
- if (pool->p.order != 0) - return -E2BIG; + if (pool->p.order != binding->niov_shift - PAGE_SHIFT) + return -EINVAL;
net_devmem_dmabuf_binding_get(binding); return 0; diff --git a/net/core/devmem.h b/net/core/devmem.h index 3852a56036cb..4a293a7d1149 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -71,6 +71,8 @@ struct net_devmem_dmabuf_binding { */ struct net_iov **tx_vec;
+ unsigned int niov_shift; + struct work_struct unbind_w; };
@@ -93,7 +95,8 @@ struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, struct device *dma_dev, enum dma_data_direction direction, - unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + unsigned int dmabuf_fd, unsigned int niov_shift, + struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); @@ -122,10 +125,11 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct net_iov_area *owner = net_iov_owner(niov); + struct dmabuf_genpool_chunk_owner *co = + net_devmem_iov_to_chunk_owner(niov);
- return owner->base_virtual + - ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); + return net_iov_owner(niov)->base_virtual + + ((unsigned long)net_iov_idx(niov) << co->binding->niov_shift); }
static inline bool @@ -175,6 +179,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, + unsigned int niov_shift, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index c7e138bfe345..55e03b9cd227 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -106,10 +106,11 @@ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE };
/* NETDEV_CMD_BIND_RX - do */ -static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_RX_BUF_SIZE + 1] = { [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), + [NETDEV_A_DMABUF_RX_BUF_SIZE] = { .type = NLA_U32, }, };
/* NETDEV_CMD_NAPI_SET - do */ @@ -219,7 +220,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_BIND_RX, .doit = netdev_nl_bind_rx_doit, .policy = netdev_bind_rx_nl_policy, - .maxattr = NETDEV_A_DMABUF_FD, + .maxattr = NETDEV_A_DMABUF_RX_BUF_SIZE, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b4d48f3672a5..9902a97698f5 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -1012,6 +1012,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; + unsigned int niov_shift = PAGE_SHIFT; struct netdev_nl_sock *priv; struct net_device *netdev; unsigned long *rxq_bitmap; @@ -1028,6 +1029,18 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+ if (info->attrs[NETDEV_A_DMABUF_RX_BUF_SIZE]) { + u32 rx_buf_size = nla_get_u32(info->attrs[NETDEV_A_DMABUF_RX_BUF_SIZE]); + + if (!rx_buf_size || !is_power_of_2(rx_buf_size) || + rx_buf_size < PAGE_SIZE) { + NL_SET_ERR_MSG(info->extack, + "rx_buf_size must be a power of 2 >= PAGE_SIZE"); + return -EINVAL; + } + niov_shift = ilog2(rx_buf_size); + } + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); @@ -1078,7 +1091,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) }
binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE, - dmabuf_fd, priv, info->extack); + dmabuf_fd, niov_shift, priv, + info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_rxq_bitmap; @@ -1221,7 +1235,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) binding = net_devmem_bind_dmabuf(bind_dev, bind_dev != netdev ? netdev : NULL, dma_dev, DMA_TO_DEVICE, dmabuf_fd, - priv, info->extack); + PAGE_SHIFT, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_bind_dev; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7df1056a35fd..180a4ffffd60 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -217,6 +217,7 @@ enum { NETDEV_A_DMABUF_QUEUES, NETDEV_A_DMABUF_FD, NETDEV_A_DMABUF_ID, + NETDEV_A_DMABUF_RX_BUF_SIZE,
__NETDEV_A_DMABUF_MAX, NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)
On 06/03, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
Every devmem dmabuf binding today hands the page_pool PAGE_SIZE niovs. This caps a single RX descriptor at PAGE_SIZE, burning CPU on buffer churn for large flows.
Add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that lets userspace request a larger niov size. The value must be a power of two >= PAGE_SIZE.
Measurements
Setup: kperf in devmem RX/TX cuda mode, 4 flows, 64 MB messages, 60s, dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov size, mlx5.
CPU Util:
niov net sirq % net idle % app sys % app idle %
4K 62.38 +/- 8.27 33.40 +/- 7.51 54.15 +/- 10.23 43.67 +/- 10.53
16K 58.91 +/- 5.35 35.23 +/- 5.88 41.05 +/- 8.87 56.42 +/- 9.24
32K 64.12 +/- 0.68 31.09 +/- 1.48 44.54 +/- 3.51 52.63 +/- 3.65
64K 54.69 +/- 5.54 39.67 +/- 5.81 35.47 +/- 3.11 61.97 +/- 3.27
RX app sys % drops ~19% from 4K to 64K.
Throughput:
niov RX dev Gbps RX flow avg Gbps
4K 300.63 +/- 53.21 75.16 +/- 13.30
16K 321.35 +/- 28.20 80.34 +/- 7.05
32K 347.63 +/- 2.20 86.91 +/- 0.55
64K 332.11 +/- 14.26 83.03 +/- 3.56
Throughput seems to increase, but the stdev is pretty wide so could just be noise.
kperf support (not yet merged): https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec...
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
Documentation/netlink/specs/netdev.yaml | 8 +++++ include/uapi/linux/netdev.h | 1 + net/core/devmem.c | 52 +++++++++++++++++++-------------- net/core/devmem.h | 13 ++++++--- net/core/netdev-genl-gen.c | 5 ++-- net/core/netdev-genl.c | 18 ++++++++++-- tools/include/uapi/linux/netdev.h | 1 + 7 files changed, 68 insertions(+), 30 deletions(-)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index a1f4c5a561e9..063119907983 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -591,6 +591,13 @@ attribute-sets: type: u32 checks: min: 1
-name: rx-buf-sizedoc: |Size in bytes of each RX buffer the NIC writes into from the bounddmabuf. Must be a power of two and >= PAGE_SIZE; defaults toPAGE_SIZE.type: u32operations: list: @@ -805,6 +812,7 @@ operations: - ifindex - fd - queues
- rx-buf-size reply: attributes: - iddiff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..180a4ffffd60 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -217,6 +217,7 @@ enum { NETDEV_A_DMABUF_QUEUES, NETDEV_A_DMABUF_FD, NETDEV_A_DMABUF_ID,
- NETDEV_A_DMABUF_RX_BUF_SIZE,
__NETDEV_A_DMABUF_MAX, NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) diff --git a/net/core/devmem.c b/net/core/devmem.c index 957d6b96216b..5a1c0d7984a8 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -46,7 +46,7 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr +
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
((dma_addr_t)net_iov_idx(niov) << owner->binding->niov_shift);} static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) @@ -93,13 +93,14 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) ssize_t offset; ssize_t index;
- dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
- dma_addr = gen_pool_alloc_owner(binding->chunk_pool,
if (!dma_addr) return NULL;1UL << binding->niov_shift, (void **)&owner);offset = dma_addr - owner->base_dma_addr;
- index = offset / PAGE_SIZE;
- index = offset >> binding->niov_shift; niov = &owner->area.niovs[index];
niov->desc.pp_magic = 0; @@ -113,12 +114,13 @@ void net_devmem_free_dmabuf(struct net_iov *niov) { struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov);
- size_t niov_size = 1UL << binding->niov_shift;
if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
PAGE_SIZE)))
return;niov_size)))
- gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
- gen_pool_free(binding->chunk_pool, dma_addr, niov_size);
} void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -163,6 +165,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, u32 xa_idx; int err;
- if (binding->niov_shift != PAGE_SHIFT)
mp_params.rx_page_size = 1U << binding->niov_shift;- err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack); if (err) return err;
@@ -184,14 +189,16 @@ struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, struct device *dma_dev, enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
unsigned int dmabuf_fd, unsigned int niov_shift,struct netdev_nl_sock *priv, struct netlink_ext_ack *extack){ struct net_devmem_dmabuf_binding *binding;
- size_t niov_size = 1UL << niov_shift; static u32 id_alloc_next;
- unsigned int sg_idx, i; struct scatterlist *sg; struct dma_buf *dmabuf;
- unsigned int sg_idx, i; unsigned long virtual; int err;
@@ -213,6 +220,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, binding->dev = dev; binding->vdev = vdev;
- binding->niov_shift = niov_shift; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
err = percpu_ref_init(&binding->ref, @@ -248,18 +256,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, goto err_unmap; } binding->tx_vec = kvmalloc_objs(struct net_iov *,
dmabuf->size / PAGE_SIZE);
if (!binding->tx_vec) { err = -ENOMEM; goto err_unmap; } }dmabuf->size >> niov_shift);
- /* For simplicity we expect to make PAGE_SIZE allocations, but the
* binding can be much more flexible than that. We may be able to* allocate MTU sized chunks here. Leave that for future work...*/- binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
- binding->chunk_pool = gen_pool_create(niov_shift, dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM;
@@ -273,9 +277,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, size_t len = sg_dma_len(sg); struct net_iov *niov;
if (!IS_ALIGNED(len, PAGE_SIZE)) {
if (!IS_ALIGNED(dma_addr, niov_size) ||!IS_ALIGNED(len, niov_size)) { err = -EINVAL;
NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned");
NL_SET_ERR_MSG(extack,"dmabuf sg entry not aligned to niov size");
nit: should we NL_SET_ERR_MSG_FMT here and export chunk len and expected alignment?
goto err_free_chunks; }@@ -288,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr;
owner->area.num_niovs = len / PAGE_SIZE;
owner->binding = binding;owner->area.num_niovs = len >> niov_shift;err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -313,7 +319,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE)
binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
}binding->tx_vec[(owner->area.base_virtual >> niov_shift) + i] = niov;virtual += len; @@ -430,13 +436,15 @@ struct net_iov * net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t virt_addr, size_t *off, size_t *size) {
- size_t niov_size = 1UL << binding->niov_shift;
- if (virt_addr >= binding->dmabuf->size) return NULL;
- *off = virt_addr % PAGE_SIZE;
- *size = PAGE_SIZE - *off;
- *off = virt_addr & (niov_size - 1);
- *size = niov_size - *off;
- return binding->tx_vec[virt_addr / PAGE_SIZE];
- return binding->tx_vec[virt_addr >> binding->niov_shift];
} /*** "Dmabuf devmem memory provider" ***/ @@ -454,8 +462,8 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) pool->dma_sync = false; pool->dma_sync_for_cpu = false;
- if (pool->p.order != 0)
return -E2BIG;
- if (pool->p.order != binding->niov_shift - PAGE_SHIFT)
return -EINVAL;
Any specific reason you change E2BIG to EINVAL?
On Fri, Jun 05, 2026 at 08:33:04AM -0700, Stanislav Fomichev wrote:
On 06/03, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
Every devmem dmabuf binding today hands the page_pool PAGE_SIZE niovs. This caps a single RX descriptor at PAGE_SIZE, burning CPU on buffer churn for large flows.
Add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that lets userspace request a larger niov size. The value must be a power of two >= PAGE_SIZE.
Measurements
Setup: kperf in devmem RX/TX cuda mode, 4 flows, 64 MB messages, 60s, dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov size, mlx5.
CPU Util:
niov net sirq % net idle % app sys % app idle %
4K 62.38 +/- 8.27 33.40 +/- 7.51 54.15 +/- 10.23 43.67 +/- 10.53
16K 58.91 +/- 5.35 35.23 +/- 5.88 41.05 +/- 8.87 56.42 +/- 9.24
32K 64.12 +/- 0.68 31.09 +/- 1.48 44.54 +/- 3.51 52.63 +/- 3.65
64K 54.69 +/- 5.54 39.67 +/- 5.81 35.47 +/- 3.11 61.97 +/- 3.27
RX app sys % drops ~19% from 4K to 64K.
Throughput:
niov RX dev Gbps RX flow avg Gbps
4K 300.63 +/- 53.21 75.16 +/- 13.30
16K 321.35 +/- 28.20 80.34 +/- 7.05
32K 347.63 +/- 2.20 86.91 +/- 0.55
64K 332.11 +/- 14.26 83.03 +/- 3.56
Throughput seems to increase, but the stdev is pretty wide so could just be noise.
kperf support (not yet merged): https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec...
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
Documentation/netlink/specs/netdev.yaml | 8 +++++ include/uapi/linux/netdev.h | 1 + net/core/devmem.c | 52 +++++++++++++++++++-------------- net/core/devmem.h | 13 ++++++--- net/core/netdev-genl-gen.c | 5 ++-- net/core/netdev-genl.c | 18 ++++++++++-- tools/include/uapi/linux/netdev.h | 1 + 7 files changed, 68 insertions(+), 30 deletions(-)
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index a1f4c5a561e9..063119907983 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -591,6 +591,13 @@ attribute-sets: type: u32 checks: min: 1
-name: rx-buf-sizedoc: |Size in bytes of each RX buffer the NIC writes into from the bounddmabuf. Must be a power of two and >= PAGE_SIZE; defaults toPAGE_SIZE.type: u32operations: list: @@ -805,6 +812,7 @@ operations: - ifindex - fd - queues
- rx-buf-size reply: attributes: - iddiff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..180a4ffffd60 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -217,6 +217,7 @@ enum { NETDEV_A_DMABUF_QUEUES, NETDEV_A_DMABUF_FD, NETDEV_A_DMABUF_ID,
- NETDEV_A_DMABUF_RX_BUF_SIZE,
__NETDEV_A_DMABUF_MAX, NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) diff --git a/net/core/devmem.c b/net/core/devmem.c index 957d6b96216b..5a1c0d7984a8 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -46,7 +46,7 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr +
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
((dma_addr_t)net_iov_idx(niov) << owner->binding->niov_shift);} static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) @@ -93,13 +93,14 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) ssize_t offset; ssize_t index;
- dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
- dma_addr = gen_pool_alloc_owner(binding->chunk_pool,
if (!dma_addr) return NULL;1UL << binding->niov_shift, (void **)&owner);offset = dma_addr - owner->base_dma_addr;
- index = offset / PAGE_SIZE;
- index = offset >> binding->niov_shift; niov = &owner->area.niovs[index];
niov->desc.pp_magic = 0; @@ -113,12 +114,13 @@ void net_devmem_free_dmabuf(struct net_iov *niov) { struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov);
- size_t niov_size = 1UL << binding->niov_shift;
if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
PAGE_SIZE)))
return;niov_size)))
- gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
- gen_pool_free(binding->chunk_pool, dma_addr, niov_size);
} void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -163,6 +165,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, u32 xa_idx; int err;
- if (binding->niov_shift != PAGE_SHIFT)
mp_params.rx_page_size = 1U << binding->niov_shift;- err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack); if (err) return err;
@@ -184,14 +189,16 @@ struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, struct device *dma_dev, enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
unsigned int dmabuf_fd, unsigned int niov_shift,struct netdev_nl_sock *priv, struct netlink_ext_ack *extack){ struct net_devmem_dmabuf_binding *binding;
- size_t niov_size = 1UL << niov_shift; static u32 id_alloc_next;
- unsigned int sg_idx, i; struct scatterlist *sg; struct dma_buf *dmabuf;
- unsigned int sg_idx, i; unsigned long virtual; int err;
@@ -213,6 +220,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, binding->dev = dev; binding->vdev = vdev;
- binding->niov_shift = niov_shift; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
err = percpu_ref_init(&binding->ref, @@ -248,18 +256,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, goto err_unmap; } binding->tx_vec = kvmalloc_objs(struct net_iov *,
dmabuf->size / PAGE_SIZE);
if (!binding->tx_vec) { err = -ENOMEM; goto err_unmap; } }dmabuf->size >> niov_shift);
- /* For simplicity we expect to make PAGE_SIZE allocations, but the
* binding can be much more flexible than that. We may be able to* allocate MTU sized chunks here. Leave that for future work...*/- binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
- binding->chunk_pool = gen_pool_create(niov_shift, dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM;
@@ -273,9 +277,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, size_t len = sg_dma_len(sg); struct net_iov *niov;
if (!IS_ALIGNED(len, PAGE_SIZE)) {
if (!IS_ALIGNED(dma_addr, niov_size) ||!IS_ALIGNED(len, niov_size)) { err = -EINVAL;
NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned");
NL_SET_ERR_MSG(extack,"dmabuf sg entry not aligned to niov size");nit: should we NL_SET_ERR_MSG_FMT here and export chunk len and expected alignment?
sgtm!
goto err_free_chunks; }@@ -288,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr;
owner->area.num_niovs = len / PAGE_SIZE;
owner->binding = binding;owner->area.num_niovs = len >> niov_shift;err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -313,7 +319,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev, page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE)
binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
}binding->tx_vec[(owner->area.base_virtual >> niov_shift) + i] = niov;virtual += len; @@ -430,13 +436,15 @@ struct net_iov * net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t virt_addr, size_t *off, size_t *size) {
- size_t niov_size = 1UL << binding->niov_shift;
- if (virt_addr >= binding->dmabuf->size) return NULL;
- *off = virt_addr % PAGE_SIZE;
- *size = PAGE_SIZE - *off;
- *off = virt_addr & (niov_size - 1);
- *size = niov_size - *off;
- return binding->tx_vec[virt_addr / PAGE_SIZE];
- return binding->tx_vec[virt_addr >> binding->niov_shift];
} /*** "Dmabuf devmem memory provider" ***/ @@ -454,8 +462,8 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) pool->dma_sync = false; pool->dma_sync_for_cpu = false;
- if (pool->p.order != 0)
return -E2BIG;
- if (pool->p.order != binding->niov_shift - PAGE_SHIFT)
return -EINVAL;Any specific reason you change E2BIG to EINVAL?
It seemed to reflect the new conditional more accurately: when the pool order is less than niov_shift - PAGE_SHIFT, the order is too small, not too big. TBH, I'm not sure that case is ever actually hit, though, at least with current drivers...
Not married to it, open to go back to E2BIG.
Best, Bobby
From: Bobby Eshleman bobbyeshleman@meta.com
get_sg_table() emitted one PAGE_SIZE sg entry per page even when the underlying folio was larger.
Instead, walk folios[] and emit one sg entry per folio (more precisely, per contiguous run of pages within a folio). When folios represent large pages (as is the case with MFD_HUGETLB), each sg entry covers a large page. sg tables for normal PAGE_SIZE folios are unchanged.
Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with udmabuf.
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com --- drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb..f28dd3788ada 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) vm_unmap_ram(map->vaddr, ubuf->pagecount); }
+/* Return the number of contiguous pages backed by the folio at @i. + * A udmabuf may map only part of a folio, or reference the same folio + * in multiple non-contiguous runs, so folio_nr_pages() can't be used. + */ +static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) +{ + struct folio *f = ubuf->folios[i]; + pgoff_t j; + + for (j = 1; i + j < ubuf->pagecount; j++) { + if (ubuf->folios[i + j] != f) + break; + /* Same folio, but not a sequential offset within it. */ + if (ubuf->offsets[i + j] != ubuf->offsets[i] + j * PAGE_SIZE) + break; + } + return j; +} + +/* Count the contiguous folio runs in @ubuf, one sg entry per run. */ +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) +{ + unsigned int nents = 0; + pgoff_t i; + + for (i = 0; i < ubuf->pagecount; i += udmabuf_folio_nr_pages(ubuf, i)) + nents++; + return nents; +} + static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; - struct sg_table *sg; struct scatterlist *sgl; - unsigned int i = 0; + struct sg_table *sg; + pgoff_t i, run; + unsigned int nents; int ret;
+ nents = udmabuf_sg_nents(ubuf); + sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);
- ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL); + ret = sg_alloc_table(sg, nents, GFP_KERNEL); if (ret < 0) goto err_alloc;
- for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) - sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE, + sgl = sg->sgl; + for (i = 0; i < ubuf->pagecount; i += run) { + run = udmabuf_folio_nr_pages(ubuf, i); + sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, ubuf->offsets[i]); + sgl = sg_next(sgl); + }
ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0)
On 6/4/26 02:42, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
get_sg_table() emitted one PAGE_SIZE sg entry per page even when the underlying folio was larger.
Instead, walk folios[] and emit one sg entry per folio. When folios represent large pages (as is for MFD_HUGETLB), each sg entry is a large page. Normal PAGE_SIZE sg tables are unchanged.
Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with udmabuf.
That doesn't explain why this is required.
Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden!
Regards, Christian.
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb..f28dd3788ada 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) vm_unmap_ram(map->vaddr, ubuf->pagecount); }
+/* Return the number of contiguous pages backed by the folio at @i.
- A udmabuf may map only part of a folio, or reference the same folio
- in multiple non-contiguous runs, so folio_nr_pages() can't be used.
- */
+static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) +{
struct folio *f = ubuf->folios[i];pgoff_t j;for (j = 1; i + j < ubuf->pagecount; j++) {if (ubuf->folios[i + j] != f)break;/* Same folio, but not a sequential offset within it. */if (ubuf->offsets[i + j] != ubuf->offsets[i] + j * PAGE_SIZE)break;}return j;+}
+/* Count the contiguous folio runs in @ubuf, one sg entry per run. */ +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) +{
unsigned int nents = 0;pgoff_t i;for (i = 0; i < ubuf->pagecount; i += udmabuf_folio_nr_pages(ubuf, i))nents++;return nents;+}
static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv;
struct sg_table *sg; struct scatterlist *sgl;unsigned int i = 0;
struct sg_table *sg;pgoff_t i, run;unsigned int nents; int ret;nents = udmabuf_sg_nents(ubuf);sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);
ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
ret = sg_alloc_table(sg, nents, GFP_KERNEL); if (ret < 0) goto err_alloc;
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
sgl = sg->sgl;for (i = 0; i < ubuf->pagecount; i += run) {run = udmabuf_folio_nr_pages(ubuf, i);sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, ubuf->offsets[i]);sgl = sg_next(sgl);} ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0)-- 2.53.0-Meta
On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote:
On 6/4/26 02:42, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
get_sg_table() emitted one PAGE_SIZE sg entry per page even when the underlying folio was larger.
Instead, walk folios[] and emit one sg entry per folio. When folios represent large pages (as is for MFD_HUGETLB), each sg entry is a large page. Normal PAGE_SIZE sg tables are unchanged.
Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with udmabuf.
That doesn't explain why this is required.
Sure, can definitely add. Devmem currently requires dmabuf sg entries to be length and size aligned when it allocates niovs for NIC page pools. Though udmabuf is not violating any dmabuf contract by emitting PAGE_SIZE entries, and the above restriction is probably more a shortcoming of devmem, by emitting a single entry per folio this patch allows udmabuf to be used by devmem for large pages.
Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden!
Regards, Christian.
It seems both devmem and io_uring zcrx at least introspect through to the sg-table to build NIC page pools (not accessing the memory itself, however). Is there a better way?
Best, Bobby
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb..f28dd3788ada 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) vm_unmap_ram(map->vaddr, ubuf->pagecount); }
+/* Return the number of contiguous pages backed by the folio at @i.
- A udmabuf may map only part of a folio, or reference the same folio
- in multiple non-contiguous runs, so folio_nr_pages() can't be used.
- */
+static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) +{
struct folio *f = ubuf->folios[i];pgoff_t j;for (j = 1; i + j < ubuf->pagecount; j++) {if (ubuf->folios[i + j] != f)break;/* Same folio, but not a sequential offset within it. */if (ubuf->offsets[i + j] != ubuf->offsets[i] + j * PAGE_SIZE)break;}return j;+}
+/* Count the contiguous folio runs in @ubuf, one sg entry per run. */ +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) +{
unsigned int nents = 0;pgoff_t i;for (i = 0; i < ubuf->pagecount; i += udmabuf_folio_nr_pages(ubuf, i))nents++;return nents;+}
static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv;
struct sg_table *sg; struct scatterlist *sgl;unsigned int i = 0;
struct sg_table *sg;pgoff_t i, run;unsigned int nents; int ret;nents = udmabuf_sg_nents(ubuf);sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);
ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
ret = sg_alloc_table(sg, nents, GFP_KERNEL); if (ret < 0) goto err_alloc;
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
sgl = sg->sgl;for (i = 0; i < ubuf->pagecount; i += run) {run = udmabuf_folio_nr_pages(ubuf, i);sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, ubuf->offsets[i]);sgl = sg_next(sgl);} ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0)-- 2.53.0-Meta
On 6/5/26 20:44, Bobby Eshleman wrote:
On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote:
On 6/4/26 02:42, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
get_sg_table() emitted one PAGE_SIZE sg entry per page even when the underlying folio was larger.
Instead, walk folios[] and emit one sg entry per folio. When folios represent large pages (as is for MFD_HUGETLB), each sg entry is a large page. Normal PAGE_SIZE sg tables are unchanged.
Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with udmabuf.
That doesn't explain why this is required.
Sure, can definitely add. Devmem currently requires dmabuf sg entries to be length and size aligned when it allocates niovs for NIC page pools. Though udmabuf is not violating any dmabuf contract by emitting PAGE_SIZE entries and the above restriction is probably more a shortfalling of devmem, by emitting a single entry per folio this patch allows udmabuf to be used by devmem for large pages.
Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden!
Regards, Christian.
It seems both devmem and io_uring zcrx at least introspect through to the sg-table to build NIC page pools (not accessing the memory itself, however). Is there a better way?
That's an absolute NO-GO! We need to stop that immediately.
Touching the underlying struct page of a DMA-buf exported sg-table is strictly forbidden.
We even have code to wrap the sg_table and hide the struct pages on debug builds to catch those issues, see function dma_buf_wrap_sg_table().
My last status is that the NIC page pools are built directly from the DMA addresses exposed by the sg_table.
Was there any change I'm not aware of?
Regards, Christian.
Best, Bobby
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb..f28dd3788ada 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) vm_unmap_ram(map->vaddr, ubuf->pagecount); }
+/* Return the number of contiguous pages backed by the folio at @i.
- A udmabuf may map only part of a folio, or reference the same folio
- in multiple non-contiguous runs, so folio_nr_pages() can't be used.
- */
+static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) +{
struct folio *f = ubuf->folios[i];pgoff_t j;for (j = 1; i + j < ubuf->pagecount; j++) {if (ubuf->folios[i + j] != f)break;/* Same folio, but not a sequential offset within it. */if (ubuf->offsets[i + j] != ubuf->offsets[i] + j * PAGE_SIZE)break;}return j;+}
+/* Count the contiguous folio runs in @ubuf, one sg entry per run. */ +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) +{
unsigned int nents = 0;pgoff_t i;for (i = 0; i < ubuf->pagecount; i += udmabuf_folio_nr_pages(ubuf, i))nents++;return nents;+}
static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv;
struct sg_table *sg; struct scatterlist *sgl;unsigned int i = 0;
struct sg_table *sg;pgoff_t i, run;unsigned int nents; int ret;nents = udmabuf_sg_nents(ubuf);sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);
ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
ret = sg_alloc_table(sg, nents, GFP_KERNEL); if (ret < 0) goto err_alloc;
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
sgl = sg->sgl;for (i = 0; i < ubuf->pagecount; i += run) {run = udmabuf_folio_nr_pages(ubuf, i);sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, ubuf->offsets[i]);sgl = sg_next(sgl);} ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0)-- 2.53.0-Meta
On Sun, Jun 7, 2026 at 11:42 PM Christian König christian.koenig@amd.com wrote:
On 6/5/26 20:44, Bobby Eshleman wrote:
On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote:
On 6/4/26 02:42, Bobby Eshleman wrote:
From: Bobby Eshleman bobbyeshleman@meta.com
get_sg_table() emitted one PAGE_SIZE sg entry per page even when the underlying folio was larger.
Instead, walk folios[] and emit one sg entry per folio. When folios represent large pages (as is for MFD_HUGETLB), each sg entry is a large page. Normal PAGE_SIZE sg tables are unchanged.
Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with udmabuf.
That doesn't explain why this is required.
Sure, can definitely add. Devmem currently requires dmabuf sg entries to be length and size aligned when it allocates niovs for NIC page pools. Though udmabuf is not violating any dmabuf contract by emitting PAGE_SIZE entries and the above restriction is probably more a shortfalling of devmem, by emitting a single entry per folio this patch allows udmabuf to be used by devmem for large pages.
Please note that accessing the pages/folio of an sg-table returned by
DMA-buf is illegal and strictly forbidden!
Regards, Christian.
It seems both devmem and io_uring zcrx at least introspect through to the sg-table to build NIC page pools (not accessing the memory itself, however). Is there a better way?
That's an absolute NO-GO! We need to stop that immediately.
Touching the underlying struct page of an DMA-buf exported sg-table is strictly forbidden.
We even have code to wrap the sg_table and hide the struct pages on debug builds to catch those issues, see function dma_buf_wrap_sg_table().
My last status is that the NIC page pools are build directly from the DMA addresses exposed by the sg_table.
Was there any change I'm not aware of?
Regards, Christian.
Oh no change, your mental model is still current. They just go through each sg and use sg_dma_address() on each.
Best, Bobby
Best, Bobby
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com
drivers/dma-buf/udmabuf.c | 47
++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 5 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb..f28dd3788ada 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf,
struct iosys_map *map)
vm_unmap_ram(map->vaddr, ubuf->pagecount);}
+/* Return the number of contiguous pages backed by the folio at @i.
- A udmabuf may map only part of a folio, or reference the same folio
- in multiple non-contiguous runs, so folio_nr_pages() can't be used.
- */
+static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) +{
struct folio *f = ubuf->folios[i];pgoff_t j;for (j = 1; i + j < ubuf->pagecount; j++) {if (ubuf->folios[i + j] != f)break;/* Same folio, but not a sequential offset within it.*/
if (ubuf->offsets[i + j] != ubuf->offsets[i] + j *PAGE_SIZE)
break;}return j;+}
+/* Count the contiguous folio runs in @ubuf, one sg entry per run. */ +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) +{
unsigned int nents = 0;pgoff_t i;for (i = 0; i < ubuf->pagecount; i +=udmabuf_folio_nr_pages(ubuf, i))
nents++;return nents;+}
static struct sg_table *get_sg_table(struct device *dev, struct
dma_buf *buf,
enum dma_data_direction direction){ struct udmabuf *ubuf = buf->priv;
struct sg_table *sg; struct scatterlist *sgl;unsigned int i = 0;
struct sg_table *sg;pgoff_t i, run;unsigned int nents; int ret;nents = udmabuf_sg_nents(ubuf);sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);
ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
ret = sg_alloc_table(sg, nents, GFP_KERNEL); if (ret < 0) goto err_alloc;
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
sgl = sg->sgl;for (i = 0; i < ubuf->pagecount; i += run) {run = udmabuf_folio_nr_pages(ubuf, i);sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, ubuf->offsets[i]);sgl = sg_next(sgl);} ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0)-- 2.53.0-Meta
On 6/8/26 15:55, Bobby Eshleman wrote:
On Sun, Jun 7, 2026 at 11:42 PM Christian König <christian.koenig@amd.com mailto:christian.koenig@amd.com> wrote:
On 6/5/26 20:44, Bobby Eshleman wrote: > On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote: >> On 6/4/26 02:42, Bobby Eshleman wrote: >>> From: Bobby Eshleman <bobbyeshleman@meta.com <mailto:bobbyeshleman@meta.com>> >>> >>> get_sg_table() emitted one PAGE_SIZE sg entry per page even when the >>> underlying folio was larger. >>> >>> Instead, walk folios[] and emit one sg entry per folio. When folios >>> represent large pages (as is for MFD_HUGETLB), each sg entry is a large >>> page. Normal PAGE_SIZE sg tables are unchanged. >>> >>> Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with >>> udmabuf. >> >> That doesn't explain why this is required. > > Sure, can definitely add. Devmem currently requires dmabuf sg entries to > be length and size aligned when it allocates niovs for NIC page pools. > Though udmabuf is not violating any dmabuf contract by emitting > PAGE_SIZE entries and the above restriction is probably more a > shortfalling of devmem, by emitting a single entry per folio this patch > allows udmabuf to be used by devmem for large pages. > >> >> Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden! >> >> Regards, >> Christian. > > It seems both devmem and io_uring zcrx at least introspect through to > the sg-table to build NIC page pools (not accessing the memory itself, > however). Is there a better way? That's an absolute NO-GO! We need to stop that immediately. Touching the underlying struct page of an DMA-buf exported sg-table is strictly forbidden. We even have code to wrap the sg_table and hide the struct pages on debug builds to catch those issues, see function dma_buf_wrap_sg_table(). My last status is that the NIC page pools are build directly from the DMA addresses exposed by the sg_table. Was there any change I'm not aware of? Regards, Christian.Oh no change, your mental model is still current. They just go through each sg and use sg_dma_address() on each.
Ah, thanks! That was a near heart attack :D
Yeah that is perfectly correct, question is do you then still really need this udmabuf change? I mean the DMA API usually merges together contiguous DMA addresses.
Regards, Christian.
Best, Bobby
> > Best, > Bobby > >> >>> Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com <mailto:bobbyeshleman@meta.com>> >>> --- >>> drivers/dma-buf/udmabuf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- >>> 1 file changed, 42 insertions(+), 5 deletions(-) >>> >>> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c >>> index 94b8ecb892bb..f28dd3788ada 100644 >>> --- a/drivers/dma-buf/udmabuf.c >>> +++ b/drivers/dma-buf/udmabuf.c >>> @@ -141,26 +141,63 @@ static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) >>> vm_unmap_ram(map->vaddr, ubuf->pagecount); >>> } >>> >>> +/* Return the number of contiguous pages backed by the folio at @i. >>> + * A udmabuf may map only part of a folio, or reference the same folio >>> + * in multiple non-contiguous runs, so folio_nr_pages() can't be used. >>> + */ >>> +static pgoff_t udmabuf_folio_nr_pages(struct udmabuf *ubuf, pgoff_t i) >>> +{ >>> + struct folio *f = ubuf->folios[i]; >>> + pgoff_t j; >>> + >>> + for (j = 1; i + j < ubuf->pagecount; j++) { >>> + if (ubuf->folios[i + j] != f) >>> + break; >>> + /* Same folio, but not a sequential offset within it. */ >>> + if (ubuf->offsets[i + j] != ubuf->offsets[i] + j * PAGE_SIZE) >>> + break; >>> + } >>> + return j; >>> +} >>> + >>> +/* Count the contiguous folio runs in @ubuf, one sg entry per run. 
*/ >>> +static unsigned int udmabuf_sg_nents(struct udmabuf *ubuf) >>> +{ >>> + unsigned int nents = 0; >>> + pgoff_t i; >>> + >>> + for (i = 0; i < ubuf->pagecount; i += udmabuf_folio_nr_pages(ubuf, i)) >>> + nents++; >>> + return nents; >>> +} >>> + >>> static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, >>> enum dma_data_direction direction) >>> { >>> struct udmabuf *ubuf = buf->priv; >>> - struct sg_table *sg; >>> struct scatterlist *sgl; >>> - unsigned int i = 0; >>> + struct sg_table *sg; >>> + pgoff_t i, run; >>> + unsigned int nents; >>> int ret; >>> >>> + nents = udmabuf_sg_nents(ubuf); >>> + >>> sg = kzalloc_obj(*sg); >>> if (!sg) >>> return ERR_PTR(-ENOMEM); >>> >>> - ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL); >>> + ret = sg_alloc_table(sg, nents, GFP_KERNEL); >>> if (ret < 0) >>> goto err_alloc; >>> >>> - for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) >>> - sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE, >>> + sgl = sg->sgl; >>> + for (i = 0; i < ubuf->pagecount; i += run) { >>> + run = udmabuf_folio_nr_pages(ubuf, i); >>> + sg_set_folio(sgl, ubuf->folios[i], run << PAGE_SHIFT, >>> ubuf->offsets[i]); >>> + sgl = sg_next(sgl); >>> + } >>> >>> ret = dma_map_sgtable(dev, sg, direction, 0); >>> if (ret < 0) >>> >>> -- >>> 2.53.0-Meta >>> >>
(certifitasap@gmail.com) No Exam: When you buy a genuine NEBOSH Certification, there will be no need to take the exams. Convenience: Many people have tight schedules and do not have enough time to go through the exams or testing process. Buying the NEBOSH Certification will be very convenient for them. Apply for the NEBOSH certificate in Malaysia https://streamable.com/2yj6u6
To check NEBOSH results online in Kuwait or the UAE. Where can I buy the NEBOSH Diploma online? How to see my registered NEBOSH certificates? Get the NEBOSH health and safety certificate in the USA. France. Luxembourg. Switzerland. Colombia. India. Turkey. Kuwait. Qatar. Hong Kong. Bahrain. Malaysia. Russian. Japan. Ireland. Saudi Arabia. Egypt . China. Taiwan. Israel. Indonesia Lebanon. Jordan. Serbia. Iran. Thailand. United Arab Emirates. Iraq. Oman Buy original NEBOSH diplomas without exam in Saudi Arabia
We are a group of Teachers and Examiners specialized in the Acquisition of NEBOSH CERTIFICATES & all Certificates without taking the exams. https://streamable.com/2n368u
1- We provide an Official certificate with registration in the database and actual center stamps for customers interested in obtaining the certificate without taking the test.
2. We provide you with a new certificate with the updated results for you to follow your PR procedures without any risk.
3- We can provide Question papers for future tests before the actual test date. The questionnaires will be issued about 6 to 10 days before the test data and will be 100% the same questions that will appear in the test. Guaranteed at 100%. https://vimeo.com/1189663914?share=copy&fl=sv&fe=ci
4- We are teachers and examination officials working together as a team, so you can choose any of our professionals to go in for the exams on your behave.
5- You can register for your exams and go in for but we shall provide your target scores as you request because we have underground partners working at any center test, which gives us access to the system.
6- We equally assist our clients by sending recommendation letters to well-known educational institutions or enterprises offering employment abroad in Canada, the UK, the USA, Australia, New Zealand, and others to give you a kick start for your future.
NEBOSH Diplomas in countries like: USA, Australia, Belgium, Brazil, Canada, Italy, Finland, France, Germany, Israel, Mexico, Netherlands, , Spain, United Kingdom.
https://t.me/+A1TvrjQBdZc2MWRh
https://t.me/+UfhkuxkiJsw5NWEx
https://neboshigc03.wixsite.com/website/services
https://neboshigc03.wixsite.com/website/about
https://neboshigc03.wixs....com/website/book-online
https://neboshigc03.wixsite.com/website/contact
https://neboshigc03.wixsite.com/website/products
Contact us via Teams ID-- (Jacob JB)
We provide verified NEBOSH certificates with online possibilities. You are guaranteed 100% with us, as the certificate you obtain is 100% legal and accepted anywhere without any doubt. Customers interested in obtaining the certificate should contact us through the contact details listed below. Payment and prices shall be discussed upon your response to this ad. Contact Customer Care for a fast chat via Email....certifitasap@gmail.com
WhatsApp...+1 (450)912-2147
Contact us via Teams ID-- (Jacob JB)
Email (neboshigc03@yahoo.com)
NEBOSH IGC Certificate
NEBOSH Oil and Gas Safety
NEBOSH Fire Certificate
NEBOSH Diploma Contact Customer Care for a fast chat via Email....certifitasap@gmail.com
WhatsApp.....+1 (450) 912-2147
British NEBOSH certificate Obtain Real NEBOSH Diploma Acquire NEBOSH Diploma online Diploma without Exam in Tuvalu Buy NEBOSH Diploma without exam Obtain NEBOSH certificate in Dubai Acquire NEBOSH certificate in India Buy NEBOSH Fire Certificate in Nepal Obtain NEBOSH certificate in the UAE NEBOSH Fire Certificate without Exam BCSP Certification Online in Austria Obtain NEBOSH Exam in the Netherlands Get NEBOSH Diploma online without Exam Apply for NEBOSH Diploma UK without Exam Apply for the British NEBOSH Certificate Buy a NEBOSH certificate without an exam Buy British NEBOSH IGC certificates in UK Apply for the NEBOSH IGC certificate in UK NEBOSH Diploma Online in the Netherlands Acquire the NEBOSH certificate in Punjab Buy NEBOSH in New York/Islamabad/Seoul, Buy/get/order NEBOSH WITHOUT exams/test, Order NEBOSH in Tashkent/ Hanoi/Hong Kong, NEBOSH certificate without exam in Kerala Order original NEBOSH Diploma without Exam Buy CSP Certificate Without Exam in Germany Buy SMS Certificate Without Exam in Sweden Buy a CHST Certificate Without Exam in Iran NEBOSH certificate without Exam in the UAE Buy a NEBOSH certificate UK without an Exam Buy a British NEBOSH Fire certificate online Acquire NEBOSH certificates without an Exam Order NEBOSH Certificate Online Without Exam Want to Improve your NEBOSH IGC or Diplomas? Apply for the NEBOSH certificate in Malaysia. 
British NEBOSH Fire certificate without Exam Buy NEBOSH IGC Certificate Online in Austria NEBOSH certificate without exam in Hyderabad Buy NEBOSH Certificate Without Exam in Belgium Apply for the NEEBOSH certificate in Australia Buy a NEBOSH certificate in Karachi, Australia Buy NEBOSH certificate in Australia/Pakistan, Buy Original NEBOSH without exams in Karachi Obtain NEBOSH WITHOUT tests/exams in the UK, Buy a CET Certificate Without Exam in Belgium Buy NEBOSH Certificate Without Exam in Bulgaria Buy ASP Certificate Without Exam in Luxembourg Order NEBOSH certificate without Exam in Aruba Acquire a Diploma without an Exam in Australia Purchase NEBOSH Fire Certificate without Exam Buy NEBOSH certificate without Exam in Algeria Buy NEBOSH Diploma without exam in Martinique Buy NEBOSH Diploma without Exam in Kyrgyzstan NEBOSH IGC Certificate Without Exam in Denmark Get NEBOSH HSW Certificate Without Exam in Greece Buy Original NEBOSH Certificate Online in the UK Obtain NEBOSH certificate without exam in Punjab Order NEBOSH Certificate Without Exam in Bulgaria Purchase the NBOSH certificate without the Exam Apply for NEBOSH IGC Certificate UK without Exam Apply for NEBOSH Diploma without Exam in Mongolia NEBOSH certificate without Exam in New Caledonia NEBOSH IGC Certificate Without Exam in Seychelles Buy STSC Certificate Without Exam in Switzerland Buy NEBOSH HSW Certificate Without Exam in Croatia Buy STS Certificate Without Exam in the Netherlands Buy NEBOSH HSW Certificate Without Exam in Greece Buy NEBOSH HSW Certificate Without Exam in Norway Buy NEBOSH IDIP Certificate Without Exam in Turkey Buy NEBOSH IGC Certificate Without Exam in Poland Purchase a registered NEBOSH London/Sydney/Dubai NEBOSH Certificate Without Exam in Saudi Arabia Buy/get NEBOSH in Qatar without taking the test, Obtain a Registered NEBOSH certificate in Qatar, Buy/obtain/get NEBOSH certificate in Dubai/UAE, Get/purchase/Buy NEBOSH certificate in Qatar/ India, Original NEBOSH Certificate 
Without Exam in Jordan Buy NEBOSH IGC 1 Certificate Without Exam in Ukraine Buy NEBOSH IGC 2 Certificate Without Exam in Portugal Obtain NEBOSH Oil and Gas Safety without Exam in London Acquire NEBOSH Diploma without Exam in Western Sahara Purchase NEBOSH Diploma without Exam in Cote d'Ivoire Purchase NEBOSH IGC Certificate without Exam in Uganda Obtain original NEBOSH Fire Certificate without Exam Obtain NEBOSH IGC Certificate Without Exam in Poland Buy NEBOSH IGC 1 Certificate Without Exam in Ukraine Acquire NEBOSH Certificate Without Exam in Germany Acquire NEBOSH certificate without exam in Bangalore Buy a NEBOSH Certificate without an exam in Chennai Get NEBOSH Diploma Certificate Without Exam in Sweden Obtain NEBOSH Certificate Without Exam in Switzerland Acquire NEBOSH HSW Certificate Without Exam in Croatia Apply for NEBOSH HSW Certificate Without Exam in Norway Acquire NEBOSH IDIP Certificate Without Exam in Turkey Buy a NEBOSH certificate without an Exam in Bouvet Island Apply for British NEBOSH Oil and Gas Safety certificates Apply for the NEBOSH certificate without an exam in Delhi Acquire NEBOSH Certificate Without Exam in the Netherlands Apply for NEBOSH IGC Certificate Without Exam in Luxembourg Buy NEBOSH health and safety Certificate Without Exam in Iran Buy/purchase/ Acquire an original NEBOSH certificate in Karachi Order/obtain/buy NEBOSH certificate in Malaysia/Germany, Get/obtain NEBOSH certificate in Pakistan, New Zealand, Buy/Obtain/Get NEBOSH certificate in the United Kingdom (UK), Order Original NEBOSH Diploma Without Exam Online in Egypt Order Original NEBOSH course Certificate Without Exam in Jordan, Apply for Original NEBOSH Diplomas Without Exam in Saudi Arabia Apply for NEBOSH Oil and Gas Safety without Exam in Guyana Buy NEBOSH HSW Certificate Without Exam in the Czech Republic How to get the original NEBOSH Fire Certificate without Exam Get NEBOSH verification. 
Obtain a real NEBOSH Certificate online Acquire a NEBOSH certificate without an Exam in the United States Order original NEBOSH Oil and Gas Safety certificate without the Exam Purchase real NEBOSH Oil and Gas Safety certificates without an Exam Apply for NEBOSH Fire Certificate without Exam in Brunei Darussalam Apply for NEBOSH IGC Certificate without Exam in Antigua and Barbuda Purchase Original NEBOSH Diploma Online Without Exam in Jordan, Acquire Original NEBOSH Oil and Gas Safety Without Exam Online in Bahrain, Apply for Original NEBOSH Fire Certificate Without Exam Online in Algeria, Order Original NEBOSH Fire Certificate Online Without Exam in Saudi Arabia Obtain Original NEBOSH Diploma Without Exam Online in the Middle East, Obtain Original NEBOSH IGC Certificates Online Without Exam in Kuwait, Apply for Original NEBOSH Oil and Gas Safety Online Without Exam in Qatar, Obtain Original NEBOSH Fire Certificate Online Without Exam in Jordan, Apply for the NEBOSH certificate without an Exam in the Falkland Islands Apply for NEBOSH oil and gas Certificate Without Exam in Iran, Netherlands Buy/Get/Order Verified NEBOSH Certificates Online in South Korea, Gwangju Apply for Original NEBOSH Certificates Online in the United Arab Emirates, UAE, Dubai, Purchase the NEBOSH Oil and Gas Safety certificate without an Exam online in Antigua Obtain NEBOSH Oil and Gas Safety without an exam in the French Southern Territories Buy/Get/Order Verified NEBOSH Diplomas Certificates Online in South Korea, Seoul Apply for Original NEBOSH Certificate Without Exam in the United Arab Emirates, UAE, Dubai NEBOSH Diploma in the UAE, Qatar, Kuwait, Oman, Bahrain, Malaysia Singapore Jordan Saudi Arabia USA Ireland UK France Italy Belgium, Ukraine, Iceland, Brazil, Spain, Germany Egypt, Turkey, Morocco, Algeria, Greece)
Contact Customer Care for a fast chat via
Contact us via Teams ID-- (Jacob JB)
Email....certifitasap@gmail.com
WhatsApp.....+1 (450) 912-2147
Don't hesitate, feel free to contact us so as to get your certificates done perfectly and on time. available service 24Hs/7Ds
On Mon, Jun 08, 2026 at 03:59:04PM +0200, Christian König wrote:
On 6/8/26 15:55, Bobby Eshleman wrote:
On Sun, Jun 7, 2026 at 11:42 PM Christian König <christian.koenig@amd.com mailto:christian.koenig@amd.com> wrote:
On 6/5/26 20:44, Bobby Eshleman wrote: > On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote: >> On 6/4/26 02:42, Bobby Eshleman wrote: >>> From: Bobby Eshleman <bobbyeshleman@meta.com <mailto:bobbyeshleman@meta.com>> >>> >>> get_sg_table() emitted one PAGE_SIZE sg entry per page even when the >>> underlying folio was larger. >>> >>> Instead, walk folios[] and emit one sg entry per folio. When folios >>> represent large pages (as is for MFD_HUGETLB), each sg entry is a large >>> page. Normal PAGE_SIZE sg tables are unchanged. >>> >>> Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with >>> udmabuf. >> >> That doesn't explain why this is required. > > Sure, can definitely add. Devmem currently requires dmabuf sg entries to > be length and size aligned when it allocates niovs for NIC page pools. > Though udmabuf is not violating any dmabuf contract by emitting > PAGE_SIZE entries and the above restriction is probably more a > shortfalling of devmem, by emitting a single entry per folio this patch > allows udmabuf to be used by devmem for large pages. > >> >> Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden! >> >> Regards, >> Christian. > > It seems both devmem and io_uring zcrx at least introspect through to > the sg-table to build NIC page pools (not accessing the memory itself, > however). Is there a better way? That's an absolute NO-GO! We need to stop that immediately. Touching the underlying struct page of an DMA-buf exported sg-table is strictly forbidden. We even have code to wrap the sg_table and hide the struct pages on debug builds to catch those issues, see function dma_buf_wrap_sg_table(). My last status is that the NIC page pools are build directly from the DMA addresses exposed by the sg_table. Was there any change I'm not aware of? Regards, Christian.Oh no change, your mental model is still current. They just go through each sg and use sg_dma_address() on each.
Ah, thanks! That was a near heart attack :D
Yeah that is perfectly correct, question is do you then still really need this udmabuf change? I mean the DMA API usually merges together contiguous DMA addresses.
Regards, Christian.
Hey Christian, sorry for the delay, I just wanted to double-check what I'm seeing...
I reverted the udmabuf patch and confirmed devmem still runs into 4K pages even for hugepage udmabuf. I see that the dma_map_direct() path is being taken, which if I am reading the code correctly results in the sg_dma_len(sg) inheriting sg->length directly (set by udmabuf's sg_set_folio(..., PAGE_SIZE) call), compared to the iommu_dma_map_phys() path which looks like it does merge when possible.
Best, Bobby
On 6/9/26 16:58, Bobby Eshleman wrote:
On Mon, Jun 08, 2026 at 03:59:04PM +0200, Christian König wrote:
On 6/8/26 15:55, Bobby Eshleman wrote:
On Sun, Jun 7, 2026 at 11:42 PM Christian König <christian.koenig@amd.com mailto:christian.koenig@amd.com> wrote:
On 6/5/26 20:44, Bobby Eshleman wrote: > On Fri, Jun 05, 2026 at 11:30:07AM +0200, Christian König wrote: >> On 6/4/26 02:42, Bobby Eshleman wrote: >>> From: Bobby Eshleman <bobbyeshleman@meta.com <mailto:bobbyeshleman@meta.com>> >>> >>> get_sg_table() emitted one PAGE_SIZE sg entry per page even when the >>> underlying folio was larger. >>> >>> Instead, walk folios[] and emit one sg entry per folio. When folios >>> represent large pages (as is for MFD_HUGETLB), each sg entry is a large >>> page. Normal PAGE_SIZE sg tables are unchanged. >>> >>> Required by net/core/devmem to support rx-buf-size > PAGE_SIZE with >>> udmabuf. >> >> That doesn't explain why this is required. > > Sure, can definitely add. Devmem currently requires dmabuf sg entries to > be length and size aligned when it allocates niovs for NIC page pools. > Though udmabuf is not violating any dmabuf contract by emitting > PAGE_SIZE entries and the above restriction is probably more a > shortfalling of devmem, by emitting a single entry per folio this patch > allows udmabuf to be used by devmem for large pages. > >> >> Please note that accessing the pages/folio of an sg-table returned by DMA-buf is illegal and strictly forbidden! >> >> Regards, >> Christian. > > It seems both devmem and io_uring zcrx at least introspect through to > the sg-table to build NIC page pools (not accessing the memory itself, > however). Is there a better way? That's an absolute NO-GO! We need to stop that immediately. Touching the underlying struct page of an DMA-buf exported sg-table is strictly forbidden. We even have code to wrap the sg_table and hide the struct pages on debug builds to catch those issues, see function dma_buf_wrap_sg_table(). My last status is that the NIC page pools are build directly from the DMA addresses exposed by the sg_table. Was there any change I'm not aware of? Regards, Christian.Oh no change, your mental model is still current. They just go through each sg and use sg_dma_address() on each.
Ah, thanks! That was a near heart attack :D
Yeah, that is perfectly correct. The question is: do you then still really need this udmabuf change? I mean the DMA API usually merges together contiguous DMA addresses.
Regards, Christian.
Hey Christian, sorry for the delay, I just wanted to double check what I'm seeing...
I reverted the udmabuf patch and confirmed devmem still runs into 4K pages even for hugepage udmabuf. I see that the dma_map_direct() path is being taken, which, if I am reading the code correctly, results in sg_dma_len(sg) inheriting sg->length directly (set by udmabuf's sg_set_folio(..., PAGE_SIZE) call), compared to the iommu_dma_map_phys() path, which looks like it does merge when possible.
Ok that makes more sense. Yeah something which could potentially be improved elsewhere.
Feel free to go ahead with this patch as a workaround, just adjust the commit message and maybe add a code comment why it is necessary and helpful.
Thanks, Christian.
Best, Bobby
From: Bobby Eshleman <bobbyeshleman@meta.com>
Add -b <bytes> to request a non-default niov size via NETDEV_A_DMABUF_RX_BUF_SIZE. When the value exceeds PAGE_SIZE, udmabuf_alloc() switches to an MFD_HUGETLB-backed memfd so each 2 MB hugepage produces one naturally-aligned sg entry.
Reject values > 2 MB up front: MFD_HUGETLB + udmabuf can only guarantee 2 MB per sg entry (one hugepage), so a larger rx_buf_size would fail the per-sg length/alignment check.
Add CONFIG_HUGETLBFS=y to drivers/net/hw/config so the new path is reachable in the CI kernels built for these tests.
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com --- tools/testing/selftests/drivers/net/hw/config | 1 + tools/testing/selftests/drivers/net/hw/ncdevmem.c | 49 +++++++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config index b9f406dd7282..388721bee553 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -3,6 +3,7 @@ CONFIG_FAIL_FUNCTION=y CONFIG_FAULT_INJECTION=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FUNCTION_ERROR_INJECTION=y +CONFIG_HUGETLBFS=y CONFIG_INET6_ESP=y CONFIG_INET6_ESP_OFFLOAD=y CONFIG_INET_ESP=y diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index d96e8a3b5a65..325c128191e2 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -61,6 +61,7 @@ #include <sys/time.h>
#include <linux/memfd.h> +#include <sys/param.h> #include <linux/dma-buf.h> #include <linux/errqueue.h> #include <linux/udmabuf.h> @@ -79,6 +80,7 @@ #define PAGE_SHIFT 12 #define TEST_PREFIX "ncdevmem" #define NUM_PAGES 16000 +#define MB(x) ((x) << 20)
#ifndef MSG_SOCK_DEVMEM #define MSG_SOCK_DEVMEM 0x2000000 @@ -100,6 +102,7 @@ static unsigned int dmabuf_id; static uint32_t tx_dmabuf_id; static int waittime_ms = 500; static bool fail_on_linear; +static uint32_t rx_buf_size;
/* System state loaded by current_config_load() */ #define MAX_FLOWS 8 @@ -142,6 +145,7 @@ static struct memory_buffer *udmabuf_alloc(size_t size) { struct udmabuf_create create; struct memory_buffer *ctx; + unsigned int memfd_flags; int ret;
ctx = malloc(sizeof(*ctx)); @@ -156,9 +160,14 @@ static struct memory_buffer *udmabuf_alloc(size_t size) goto err_free_ctx; }
- ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + memfd_flags = MFD_ALLOW_SEALING; + if (rx_buf_size > (uint32_t)getpagesize()) + memfd_flags |= MFD_HUGETLB | MFD_HUGE_2MB; + + ctx->memfd = memfd_create("udmabuf-test", memfd_flags); if (ctx->memfd < 0) { - pr_err("[skip,no-memfd]"); + pr_err("[skip,no-memfd%s]", + (memfd_flags & MFD_HUGETLB) ? " (need hugepages)" : ""); goto err_close_dev; }
@@ -168,6 +177,11 @@ static struct memory_buffer *udmabuf_alloc(size_t size) goto err_close_memfd; }
+ if (memfd_flags & MFD_HUGETLB) { + size = roundup(size, MB(2)); + ctx->size = size; + } + ret = ftruncate(ctx->memfd, size); if (ret == -1) { pr_err("[FAIL,memfd-truncate]"); @@ -699,6 +713,8 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, netdev_bind_rx_req_set_ifindex(req, ifindex); netdev_bind_rx_req_set_fd(req, dmabuf_fd); __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); + if (rx_buf_size) + netdev_bind_rx_req_set_rx_buf_size(req, rx_buf_size);
rsp = netdev_bind_rx(*ys, req); if (!rsp) { @@ -1411,7 +1427,7 @@ int main(int argc, char *argv[]) int is_server = 0, opt; int ret, err = 1;
- while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:n")) != -1) { + while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:nb:")) != -1) { switch (opt) { case 'L': fail_on_linear = true; @@ -1446,6 +1462,33 @@ int main(int argc, char *argv[]) case 'n': skip_config = 1; break; + case 'b': { + char *endp; + unsigned long val; + + errno = 0; + val = strtoul(optarg, &endp, 0); + if (errno || endp == optarg || *endp || val == 0 || + val > UINT32_MAX) { + pr_err("invalid rx_buf_size: %s", optarg); + return 1; + } + if (val & (val - 1)) { + pr_err("rx_buf_size must be a power of 2"); + return 1; + } + if (val < (unsigned long)getpagesize()) { + pr_err("rx_buf_size must be >= PAGE_SIZE (%d)", + getpagesize()); + return 1; + } + if (val > MB(2)) { + pr_err("rx_buf_size > 2 MB not supported"); + return 1; + } + rx_buf_size = val; + break; + } case '?': fprintf(stderr, "unknown option: %c\n", optopt); break;
On 06/03, Bobby Eshleman wrote:
From: Bobby Eshleman <bobbyeshleman@meta.com>
Add -b <bytes> to request a non-default niov size via NETDEV_A_DMABUF_RX_BUF_SIZE. When the value exceeds PAGE_SIZE, udmabuf_alloc() switches to an MFD_HUGETLB-backed memfd so each 2 MB hugepage produces one naturally-aligned sg entry.
Reject values > 2 MB up front: MFD_HUGETLB + udmabuf can only guarantee 2 MB per sg entry (one hugepage), so a larger rx_buf_size would fail the per-sg length/alignment check.
Add CONFIG_HUGETLBFS=y to drivers/net/hw/config so the new path is reachable in the CI kernels built for these tests.
I vaguely remember there was also some kernel cmdline argument to reserve these? Do we need to also do something to NIPA to reserve 2MB pages for this test? Or was it for 1GB pages?
On Fri, Jun 05, 2026 at 08:35:25AM -0700, Stanislav Fomichev wrote:
On 06/03, Bobby Eshleman wrote:
From: Bobby Eshleman <bobbyeshleman@meta.com>
Add -b <bytes> to request a non-default niov size via NETDEV_A_DMABUF_RX_BUF_SIZE. When the value exceeds PAGE_SIZE, udmabuf_alloc() switches to an MFD_HUGETLB-backed memfd so each 2 MB hugepage produces one naturally-aligned sg entry.
Reject values > 2 MB up front: MFD_HUGETLB + udmabuf can only guarantee 2 MB per sg entry (one hugepage), so a larger rx_buf_size would fail the per-sg length/alignment check.
Add CONFIG_HUGETLBFS=y to drivers/net/hw/config so the new path is reachable in the CI kernels built for these tests.
I vaguely remember there was also some kernel cmdline argument to reserve these? Do we need to also do something to NIPA to reserve 2MB pages for this test? Or was it for 1GB pages?
Good call. The little runner script I used when prototyping this adds these pages at runtime, and I forgot to move that into the patch. We can have the py runner do this setup.
Best, Bobby
From: Bobby Eshleman <bobbyeshleman@meta.com>
Add a new devmem test case for binding the dmabuf with rx-buf-size=16K. The test sweeps RX payload sizes straddling the niov boundary to cover the sub-niov, exact-niov, and multi-niov RX paths.
Signed-off-by: Bobby Eshleman bobbyeshleman@meta.com --- tools/testing/selftests/drivers/net/hw/devmem.py | 12 +++++- .../testing/selftests/drivers/net/hw/devmem_lib.py | 46 +++++++++++++++++++++- .../testing/selftests/drivers/net/hw/nk_devmem.py | 11 +++++- 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 031cf9905f65..47b54e18e7a6 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0
from os import path -from devmem_lib import setup_test, run_rx, run_tx, run_tx_chunks, run_rx_hds +from devmem_lib import (setup_test, run_rx, run_tx, run_tx_chunks, run_rx_hds, + run_rx_large_niov) from lib.py import ksft_run, ksft_exit, ksft_disruptive from lib.py import NetDrvEpEnv
@@ -30,11 +31,18 @@ def check_rx_hds(cfg) -> None: run_rx_hds(cfg)
+@ksft_disruptive +def check_rx_large_niov(cfg) -> None: + """Run the devmem RX test with rx-buf-size = 16 KiB.""" + run_rx_large_niov(cfg) + + def main() -> None: """Run the devmem test cases.""" with NetDrvEpEnv(__file__) as cfg: setup_test(cfg, path.abspath(path.dirname(__file__) + "/ncdevmem")) - ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds], + ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds, + check_rx_large_niov], args=(cfg,)) ksft_exit()
diff --git a/tools/testing/selftests/drivers/net/hw/devmem_lib.py b/tools/testing/selftests/drivers/net/hw/devmem_lib.py index 0921ff03eb81..1d9ad3a294c8 100644 --- a/tools/testing/selftests/drivers/net/hw/devmem_lib.py +++ b/tools/testing/selftests/drivers/net/hw/devmem_lib.py @@ -8,7 +8,7 @@ from lib.py import (bkg, cmd, defer, ethtool, rand_port, wait_port_listen, NetdevFamily)
-def require_devmem(cfg): +def require_devmem(cfg, rx_buf_size=0): """Probe ncdevmem on cfg.ifname and SKIP the test if devmem isn't supported.""" if not hasattr(cfg, "devmem_probed"): probe_command = f"{cfg.bin_local} -f {cfg.ifname}" @@ -18,6 +18,19 @@ def require_devmem(cfg): if not cfg.devmem_supported: raise KsftSkipEx("Test requires devmem support")
+ if rx_buf_size > 0: + if not hasattr(cfg, "devmem_rx_buf_size_probed"): + cfg.devmem_rx_buf_size_probed = {} + + if rx_buf_size not in cfg.devmem_rx_buf_size_probed: + probe_command = f"{cfg.bin_local} -f {cfg.ifname} -b {rx_buf_size}" + cfg.devmem_rx_buf_size_probed[rx_buf_size] = \ + cmd(probe_command, fail=False, shell=True).ret == 0 + + if not cfg.devmem_rx_buf_size_probed[rx_buf_size]: + raise KsftSkipEx( + f"Test requires devmem rx-buf-size={rx_buf_size} support") +
def configure_nic(cfg): """Channels, rings, RSS, queue lease for netkit devmem.""" @@ -76,7 +89,8 @@ def set_flow_rule(cfg, port): return int(re.search(r'ID (\d+)', output).group(1))
-def ncdevmem_rx(cfg, port, verify=True, fail_on_linear=False, flow_steer=False): +def ncdevmem_rx(cfg, port, verify=True, fail_on_linear=False, flow_steer=False, + rx_buf_size=0): """Build the ncdevmem RX listener command.""" if hasattr(cfg, 'netns'): flow_rule_id = set_flow_rule(cfg, port) @@ -96,6 +110,8 @@ def ncdevmem_rx(cfg, port, verify=True, fail_on_linear=False, flow_steer=False): extras.append("-v 7") if fail_on_linear: extras.append("-L") + if rx_buf_size > 0: + extras.append(f"-b {rx_buf_size}")
parts = [cfg.bin_local, "-l", f"-f {ifname}", f"-s {addr}", f"-p {port}", *extras] @@ -202,6 +218,32 @@ def run_tx_chunks(cfg): ksft_eq(socat.stdout.strip(), "hello\nworld")
+def run_rx_large_niov(cfg): + """Run the devmem RX test with a large niov (rx-buf-size > PAGE_SIZE). + + Sweep payload sizes that straddle the niov boundary: below, equal to, + and above rx_buf_size, to exercise sub-niov, exact-niov, and multi-niov + RX paths. + """ + require_devmem(cfg, rx_buf_size=16384) + configure_nic(cfg) + netns = getattr(cfg, "netns", None) + + for size in [1024, 4096, 8192, 16384, 32768, 65536]: + port = rand_port() + socat = socat_send(cfg, port) + listen_cmd = ncdevmem_rx(cfg, port, + flow_steer=not netns, + rx_buf_size=16384) + data_pipe = (f"yes $(echo -e \x01\x02\x03\x04\x05\x06) | " + f"head -c {size} | {socat}") + with bkg(listen_cmd, exit_wait=True, ns=netns) as ncdevmem: + wait_port_listen(port, proto="tcp", ns=netns) + cmd(data_pipe, host=cfg.remote, shell=True) + ksft_eq(ncdevmem.ret, 0, + f"large-niov failed for payload size {size}") + + def run_rx_hds(cfg): """Run the HDS test by running devmem RX across a segment size sweep.""" require_devmem(cfg) diff --git a/tools/testing/selftests/drivers/net/hw/nk_devmem.py b/tools/testing/selftests/drivers/net/hw/nk_devmem.py index 300ed2a70ab4..7f1867e4ff32 100755 --- a/tools/testing/selftests/drivers/net/hw/nk_devmem.py +++ b/tools/testing/selftests/drivers/net/hw/nk_devmem.py @@ -3,7 +3,8 @@ """Test devmem TCP with netkit."""
import os -from devmem_lib import setup_test, run_rx, run_tx, run_tx_chunks, run_rx_hds +from devmem_lib import (setup_test, run_rx, run_tx, run_tx_chunks, run_rx_hds, + run_rx_large_niov) from lib.py import ksft_run, ksft_exit, ksft_disruptive from lib.py import NetDrvContEnv
@@ -31,6 +32,12 @@ def check_nk_rx_hds(cfg) -> None: run_rx_hds(cfg)
+@ksft_disruptive +def check_nk_rx_large_niov(cfg) -> None: + """Run the devmem RX large-niov test through netkit.""" + run_rx_large_niov(cfg) + + def main() -> None: """Run the netkit devmem test cases.""" with NetDrvContEnv(__file__, rxqueues=2, primary_rx_redirect=True) as cfg: @@ -38,7 +45,7 @@ def main() -> None: os.path.join(os.path.dirname(os.path.abspath(__file__)), "ncdevmem")) ksft_run([check_nk_rx, check_nk_tx, check_nk_tx_chunks, - check_nk_rx_hds], args=(cfg,)) + check_nk_rx_hds, check_nk_rx_large_niov], args=(cfg,)) ksft_exit()
linaro-mm-sig@lists.linaro.org