This series adds support to libbpf for attaching SCHED_CLS and SCHED_ACT bpf programs to their respective tc attach points.
Currently, a user needs to shell out to the tc command line for add, change, replace, and del operations, which is not ideal.
Some of the features that have been omitted for the CLS API:
* TCA_BPF_POLICE
  Support for adding police actions to filter has been omitted for now.
* TCA_RATE
  Support for packet rate estimator has been omitted for now.
* Attaching actions directly to the classifier
  This allows the attached actions to be bound to classifier and get auto detached when it is deleted. It translates to 'bind' refcount in the kernel internally. They run after a successful classification from the SCHED_CLS prog. Support for this can be added later, but has been omitted for now, primarily because direct-action mode provides a better alternative.
A high level TC-BPF API is also provided, and currently only supports attach and destroy operations. These functions return a pointer to a bpf_link object. When falling back to the low level API, the link must be disconnected to take over its ownership. It can be released using bpf_link__destroy, which will also cause the filter/action to be detached if not disconnected.
The individual commits contain a general API summary and examples.
Kumar Kartikeya Dwivedi (5): tools pkt_cls.h: sync with kernel sources libbpf: add helpers for preparing netlink attributes libbpf: add low level TC-BPF API libbpf: add high level TC-BPF API libbpf: add selftests for TC-BPF API
tools/include/uapi/linux/pkt_cls.h | 174 +++- tools/lib/bpf/libbpf.c | 110 ++- tools/lib/bpf/libbpf.h | 133 ++++ tools/lib/bpf/libbpf.map | 17 + tools/lib/bpf/netlink.c | 752 +++++++++++++++++- tools/lib/bpf/nlattr.h | 43 + .../selftests/bpf/prog_tests/test_tc_bpf.c | 261 ++++++ .../selftests/bpf/progs/test_tc_bpf_kern.c | 18 + 8 files changed, 1476 insertions(+), 32 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_tc_bpf.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_bpf_kern.c
-- 2.30.2
Update the header file so we can use the new defines in subsequent patches.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com --- tools/include/uapi/linux/pkt_cls.h | 174 ++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 4 deletions(-)
diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h index 12153771396a..025c40fef93d 100644 --- a/tools/include/uapi/linux/pkt_cls.h +++ b/tools/include/uapi/linux/pkt_cls.h @@ -16,9 +16,36 @@ enum { TCA_ACT_STATS, TCA_ACT_PAD, TCA_ACT_COOKIE, + TCA_ACT_FLAGS, + TCA_ACT_HW_STATS, + TCA_ACT_USED_HW_STATS, __TCA_ACT_MAX };
+#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for + * actions stats. + */ + +/* tca HW stats type + * When user does not pass the attribute, he does not care. + * It is the same as if he would pass the attribute with + * all supported bits set. + * In case no bits are set, user is not interested in getting any HW statistics. + */ +#define TCA_ACT_HW_STATS_IMMEDIATE (1 << 0) /* Means that in dump, user + * gets the current HW stats + * state from the device + * queried at the dump time. + */ +#define TCA_ACT_HW_STATS_DELAYED (1 << 1) /* Means that in dump, user gets + * HW stats that might be out of date + * for some time, maybe couple of + * seconds. This is the case when + * driver polls stats updates + * periodically or when it gets async + * stats update from the device. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 @@ -63,12 +90,53 @@ enum { #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) #define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN
+/* These macros are put here for binary compatibility with userspace apps that + * make use of them. For kernel code and new userspace apps, use the TCA_ID_* + * versions. + */ +#define TCA_ACT_GACT 5 +#define TCA_ACT_IPT 6 +#define TCA_ACT_PEDIT 7 +#define TCA_ACT_MIRRED 8 +#define TCA_ACT_NAT 9 +#define TCA_ACT_XT 10 +#define TCA_ACT_SKBEDIT 11 +#define TCA_ACT_VLAN 12 +#define TCA_ACT_BPF 13 +#define TCA_ACT_CONNMARK 14 +#define TCA_ACT_SKBMOD 15 +#define TCA_ACT_CSUM 16 +#define TCA_ACT_TUNNEL_KEY 17 +#define TCA_ACT_SIMP 22 +#define TCA_ACT_IFE 25 +#define TCA_ACT_SAMPLE 26 + /* Action type identifiers*/ -enum { - TCA_ID_UNSPEC=0, - TCA_ID_POLICE=1, +enum tca_id { + TCA_ID_UNSPEC = 0, + TCA_ID_POLICE = 1, + TCA_ID_GACT = TCA_ACT_GACT, + TCA_ID_IPT = TCA_ACT_IPT, + TCA_ID_PEDIT = TCA_ACT_PEDIT, + TCA_ID_MIRRED = TCA_ACT_MIRRED, + TCA_ID_NAT = TCA_ACT_NAT, + TCA_ID_XT = TCA_ACT_XT, + TCA_ID_SKBEDIT = TCA_ACT_SKBEDIT, + TCA_ID_VLAN = TCA_ACT_VLAN, + TCA_ID_BPF = TCA_ACT_BPF, + TCA_ID_CONNMARK = TCA_ACT_CONNMARK, + TCA_ID_SKBMOD = TCA_ACT_SKBMOD, + TCA_ID_CSUM = TCA_ACT_CSUM, + TCA_ID_TUNNEL_KEY = TCA_ACT_TUNNEL_KEY, + TCA_ID_SIMP = TCA_ACT_SIMP, + TCA_ID_IFE = TCA_ACT_IFE, + TCA_ID_SAMPLE = TCA_ACT_SAMPLE, + TCA_ID_CTINFO, + TCA_ID_MPLS, + TCA_ID_CT, + TCA_ID_GATE, /* other actions go here */ - __TCA_ID_MAX=255 + __TCA_ID_MAX = 255 };
#define TCA_ID_MAX __TCA_ID_MAX @@ -120,6 +188,10 @@ enum { TCA_POLICE_RESULT, TCA_POLICE_TM, TCA_POLICE_PAD, + TCA_POLICE_RATE64, + TCA_POLICE_PEAKRATE64, + TCA_POLICE_PKTRATE64, + TCA_POLICE_PKTBURST64, __TCA_POLICE_MAX #define TCA_POLICE_RESULT TCA_POLICE_RESULT }; @@ -333,12 +405,19 @@ enum {
/* Basic filter */
+struct tc_basic_pcnt { + __u64 rcnt; + __u64 rhit; +}; + enum { TCA_BASIC_UNSPEC, TCA_BASIC_CLASSID, TCA_BASIC_EMATCHES, TCA_BASIC_ACT, TCA_BASIC_POLICE, + TCA_BASIC_PCNT, + TCA_BASIC_PAD, __TCA_BASIC_MAX };
@@ -485,17 +564,54 @@ enum {
TCA_FLOWER_IN_HW_COUNT,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_SRC_MAX, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */ + + TCA_FLOWER_KEY_CT_STATE, /* u16 */ + TCA_FLOWER_KEY_CT_STATE_MASK, /* u16 */ + TCA_FLOWER_KEY_CT_ZONE, /* u16 */ + TCA_FLOWER_KEY_CT_ZONE_MASK, /* u16 */ + TCA_FLOWER_KEY_CT_MARK, /* u32 */ + TCA_FLOWER_KEY_CT_MARK_MASK, /* u32 */ + TCA_FLOWER_KEY_CT_LABELS, /* u128 */ + TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */ + + TCA_FLOWER_KEY_MPLS_OPTS, + + TCA_FLOWER_KEY_HASH, /* u32 */ + TCA_FLOWER_KEY_HASH_MASK, /* u32 */ + __TCA_FLOWER_MAX, };
#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
+enum { + TCA_FLOWER_KEY_CT_FLAGS_NEW = 1 << 0, /* Beginning of a new connection. */ + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ + TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ + TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, +}; + enum { TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ + * attributes + */ + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, };
@@ -513,18 +629,68 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
+enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) + +enum { + TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPTS_LSE, + __TCA_FLOWER_KEY_MPLS_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + __TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \ + (__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), };
+#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ + /* Match-all classifier */
+struct tc_matchall_pcnt { + __u64 rhit; +}; + enum { TCA_MATCHALL_UNSPEC, TCA_MATCHALL_CLASSID, TCA_MATCHALL_ACT, TCA_MATCHALL_FLAGS, + TCA_MATCHALL_PCNT, + TCA_MATCHALL_PAD, __TCA_MATCHALL_MAX, };
On Thu, Mar 25, 2021 at 5:01 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
Update the header file so we can use the new defines in subsequent patches.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com
tools/include/uapi/linux/pkt_cls.h | 174 ++++++++++++++++++++++++++++-
If libbpf is going to rely on this UAPI header, we probably need to add this header to the list of headers that are checked for being up to date. See Makefile, roughly at line 140.
1 file changed, 170 insertions(+), 4 deletions(-)
[...]
On Sat, Mar 27, 2021 at 04:55:51AM IST, Andrii Nakryiko wrote:
On Thu, Mar 25, 2021 at 5:01 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
Update the header file so we can use the new defines in subsequent patches.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com
tools/include/uapi/linux/pkt_cls.h | 174 ++++++++++++++++++++++++++++-
If libbpf is going to rely on this UAPI header, we probably need to add this header to the list of headers that are checked for being up to date. See Makefile, roughly at line 140.
Ok, will do in v2.
1 file changed, 170 insertions(+), 4 deletions(-)
[...]
-- Kartikeya
On Fri, Mar 26, 2021 at 8:54 PM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sat, Mar 27, 2021 at 04:55:51AM IST, Andrii Nakryiko wrote:
On Thu, Mar 25, 2021 at 5:01 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
Update the header file so we can use the new defines in subsequent patches.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com
tools/include/uapi/linux/pkt_cls.h | 174 ++++++++++++++++++++++++++++-
If libbpf is going to rely on this UAPI header, we probably need to add this header to the list of headers that are checked for being up to date. See Makefile, roughly at line 140.
Ok, will do in v2.
Just please hold off until I finish review of the rest of your patches.
1 file changed, 170 insertions(+), 4 deletions(-)
[...]
-- Kartikeya
This change introduces a few helpers to wrap open coded attribute preparation in netlink.c.
Every nested attribute's closure must happen using the helper end_nlattr_nested, which sets its length properly. NLA_F_NESTED is enforced using the begin_nlattr_nested helper. Other simple attributes can be added directly.
The maxsz parameter corresponds to the size of the request structure which is being filled in, so for instance with req being:
struct { struct nlmsghdr nh; struct tcmsg t; char buf[4096]; } req;
Then, maxsz should be sizeof(req).
This change also converts the open coded attribute preparation to use the helpers. Note that the only failure the internal call to add_nlattr could produce in the nested helper is -EMSGSIZE, hence that is what we return to our caller.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com --- tools/lib/bpf/netlink.c | 37 +++++++++++++++-------------------- tools/lib/bpf/nlattr.h | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 21 deletions(-)
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..f448c29de76d 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -135,7 +135,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, __u32 flags) { int sock, seq = 0, ret; - struct nlattr *nla, *nla_xdp; + struct nlattr *nla; struct { struct nlmsghdr nh; struct ifinfomsg ifinfo; @@ -157,36 +157,31 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, req.ifinfo.ifi_index = ifindex;
/* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | IFLA_XDP; - nla->nla_len = NLA_HDRLEN; + nla = begin_nlattr_nested(&req.nh, sizeof(req), IFLA_XDP); + if (!nla) { + ret = -EMSGSIZE; + goto cleanup; + }
/* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FD; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; + ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_FD, &fd, sizeof(fd)); + if (ret < 0) + goto cleanup;
/* if user passed in any flags, add those too */ if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FLAGS; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; + ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_FLAGS, &flags, sizeof(flags)); + if (ret < 0) + goto cleanup; }
if (flags & XDP_FLAGS_REPLACE) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_EXPECTED_FD; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(old_fd); - memcpy((char *)nla_xdp + NLA_HDRLEN, &old_fd, sizeof(old_fd)); - nla->nla_len += nla_xdp->nla_len; + ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_EXPECTED_FD, &flags, sizeof(flags)); + if (ret < 0) + goto cleanup; }
- req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + end_nlattr_nested(&req.nh, nla);
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { ret = -errno; diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 6cc3ac91690f..463a53bf3022 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -10,7 +10,10 @@ #define __LIBBPF_NLATTR_H
#include <stdint.h> +#include <string.h> +#include <errno.h> #include <linux/netlink.h> + /* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H
@@ -103,4 +106,44 @@ int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype,
int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh);
+ +/* Helpers for preparing/consuming attributes */ + +#define NLA_DATA(nla) ((struct nlattr *)((char *)(nla) + NLA_HDRLEN)) + +static inline int add_nlattr(struct nlmsghdr *nh, size_t maxsz, int type, + const void *data, int len) +{ + struct nlattr *nla; + + if (NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > maxsz) + return -EMSGSIZE; + if ((!data && len) || (data && !len)) + return -EINVAL; + + nla = (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)); + nla->nla_type = type; + nla->nla_len = NLA_HDRLEN + len; + if (data) + memcpy((char *)nla + NLA_HDRLEN, data, len); + nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(nla->nla_len); + return 0; +} + +static inline struct nlattr *begin_nlattr_nested(struct nlmsghdr *nh, size_t maxsz, + int type) +{ + struct nlattr *tail; + + tail = (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)); + if (add_nlattr(nh, maxsz, type | NLA_F_NESTED, NULL, 0)) + return NULL; + return tail; +} + +static inline void end_nlattr_nested(struct nlmsghdr *nh, struct nlattr *tail) +{ + tail->nla_len = ((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)) - (char *)(tail); +} + #endif /* __LIBBPF_NLATTR_H */
On Thu, Mar 25, 2021 at 5:01 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This change introduces a few helpers to wrap open coded attribute preparation in netlink.c.
Every nested attribute's closure must happen using the helper end_nlattr_nested, which sets its length properly. NLA_F_NESTED is enforced using the begin_nlattr_nested helper. Other simple attributes can be added directly.
The maxsz parameter corresponds to the size of the request structure which is being filled in, so for instance with req being:
struct { struct nlmsghdr nh; struct tcmsg t; char buf[4096]; } req;
Then, maxsz should be sizeof(req).
This change also converts the open coded attribute preparation with the helpers. Note that the only failure the internal call to add_nlattr could result in the nested helper would be -EMSGSIZE, hence that is what we return to our caller.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com
tools/lib/bpf/netlink.c | 37 +++++++++++++++-------------------- tools/lib/bpf/nlattr.h | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 21 deletions(-)
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..f448c29de76d 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -135,7 +135,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, __u32 flags) { int sock, seq = 0, ret;
struct nlattr *nla, *nla_xdp;
struct nlattr *nla; struct { struct nlmsghdr nh; struct ifinfomsg ifinfo;
@@ -157,36 +157,31 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, req.ifinfo.ifi_index = ifindex;
/* started nested attribute for XDP */
nla = (struct nlattr *)(((char *)&req)
+ NLMSG_ALIGN(req.nh.nlmsg_len));
nla->nla_type = NLA_F_NESTED | IFLA_XDP;
nla->nla_len = NLA_HDRLEN;
nla = begin_nlattr_nested(&req.nh, sizeof(req), IFLA_XDP);
if (!nla) {
ret = -EMSGSIZE;
goto cleanup;
} /* add XDP fd */
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = IFLA_XDP_FD;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
nla->nla_len += nla_xdp->nla_len;
ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_FD, &fd, sizeof(fd));
if (ret < 0)
goto cleanup; /* if user passed in any flags, add those too */ if (flags) {
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = IFLA_XDP_FLAGS;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
nla->nla_len += nla_xdp->nla_len;
ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_FLAGS, &flags, sizeof(flags));
if (ret < 0)
goto cleanup; } if (flags & XDP_FLAGS_REPLACE) {
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = IFLA_XDP_EXPECTED_FD;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(old_fd);
memcpy((char *)nla_xdp + NLA_HDRLEN, &old_fd, sizeof(old_fd));
nla->nla_len += nla_xdp->nla_len;
ret = add_nlattr(&req.nh, sizeof(req), IFLA_XDP_EXPECTED_FD, &flags, sizeof(flags));
if (ret < 0)
goto cleanup; }
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
end_nlattr_nested(&req.nh, nla); if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { ret = -errno;
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 6cc3ac91690f..463a53bf3022 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -10,7 +10,10 @@ #define __LIBBPF_NLATTR_H
#include <stdint.h> +#include <string.h> +#include <errno.h> #include <linux/netlink.h>
/* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H
@@ -103,4 +106,44 @@ int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype,
int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh);
+/* Helpers for preparing/consuming attributes */
+#define NLA_DATA(nla) ((struct nlattr *)((char *)(nla) + NLA_HDRLEN))
`((char *)nh + NLMSG_ALIGN(nh->nlmsg_len))` seems to be another popular one (three occurrences in this file), maybe extract that one as well?
And can you please use functions, not macros? This way you can specify what types you expect, as one of the benefits.
+static inline int add_nlattr(struct nlmsghdr *nh, size_t maxsz, int type,
const void *data, int len)
+{
struct nlattr *nla;
if (NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > maxsz)
return -EMSGSIZE;
if ((!data && len) || (data && !len))
return -EINVAL;
nla = (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));
nla->nla_type = type;
nla->nla_len = NLA_HDRLEN + len;
if (data)
memcpy((char *)nla + NLA_HDRLEN, data, len);
nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NLA_ALIGN(nla->nla_len);
return 0;
+}
+static inline struct nlattr *begin_nlattr_nested(struct nlmsghdr *nh, size_t maxsz,
int type)
+{
struct nlattr *tail;
tail = (struct nlattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));
if (add_nlattr(nh, maxsz, type | NLA_F_NESTED, NULL, 0))
return NULL;
return tail;
+}
+static inline void end_nlattr_nested(struct nlmsghdr *nh, struct nlattr *tail)
I don't know much about their use (yet, I feel like I'm about to learn :( ), but would nlattr_add, nlattr_begin_nested/nlattr_start_nested, nlattr_end_nested make sense and be a bit more in line with overall object_action naming pattern?
+{
tail->nla_len = ((char *)nh + NLMSG_ALIGN(nh->nlmsg_len)) - (char *)(tail);
+}
#endif /* __LIBBPF_NLATTR_H */
2.30.2
This adds functions that wrap the netlink API used for adding, manipulating, and removing filters and actions. These functions operate directly on the loaded prog's fd, and return a handle to the filter and action using an out parameter (id for tc_cls, and index for tc_act).
The basic featureset is covered to allow for attaching, manipulation of properties, and removal of filters and actions. Some additional features like TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can be added on top later by extending the bpf_tc_cls_opts struct.
Support for binding actions directly to a classifier by passing them in during filter creation has also been omitted for now. These actions have an auto clean up property because their lifetime is bound to the filter they are attached to. This can be added later, but was omitted for now as direct action mode is a better alternative to it.
An API summary:
The BPF TC-CLS API
bpf_tc_cls_{attach, change, replace}_{dev, block} may be used to attach, change, and replace SCHED_CLS bpf classifiers. Separate sets of functions are provided for network interfaces and shared filter blocks.
bpf_tc_cls_detach_{dev, block} may be used to detach an existing SCHED_CLS filter. The bpf_tc_cls_attach_id object filled in during attach, change, or replace must be passed in to the detach functions for them to remove the filter and its attached classifier correctly.
bpf_tc_cls_get_info is a helper that can be used to obtain attributes for the filter and classifier. The opts structure may be used to choose the granularity of search, such that info for a specific filter corresponding to the same loaded bpf program can be obtained. By default, the first match is returned to the user.
Examples:
struct bpf_tc_cls_attach_id id = {}; struct bpf_object *obj; struct bpf_program *p; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_cls_attach_dev(fd, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, NULL, &id); if (r < 0) return r;
... which is roughly equivalent to (after clsact qdisc setup): # tc filter add dev lo ingress bpf obj /home/kkd/foo.o sec classifier
If a user wishes to modify existing options on an attached filter, the bpf_tc_cls_change_{dev, block} API may be used. Parameters like chain_index, priority, and handle are ignored in the bpf_tc_cls_opts struct as they cannot be modified after attaching a filter.
Example:
/* Optional parameters necessary to select the right filter */ DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = id.handle, .priority = id.priority, .chain_index = id.chain_index) /* Turn on direct action mode */ opts.direct_action = true; r = bpf_tc_cls_change_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &id); if (r < 0) return r;
/* Verify that the direct action mode has been set */ struct bpf_tc_cls_info info = {}; r = bpf_tc_cls_get_info_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &info); if (r < 0) return r;
assert(info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
This would be roughly equivalent to doing: # tc filter change dev lo egress prio <p> handle <h> bpf obj /home/kkd/foo.o section classifier da
... except a new bpf program will be loaded and replace existing one.
If a user wishes to either replace an existing filter, or create a new one with the same properties, they can use bpf_tc_cls_replace_dev. The benefit of bpf_tc_cls_change is that it fails if no matching filter exists.
The BPF TC-ACT API
bpf_tc_act_{attach, replace} may be used to attach and replace already attached SCHED_ACT actions. Passing an index of 0 has special meaning, in that an index will be automatically chosen by the kernel. The index chosen by the kernel is the return value of these functions in case of success.
bpf_tc_act_detach may be used to detach a SCHED_ACT action prog identified by the index parameter. The index 0 again has a special meaning, in that passing it will flush all existing SCHED_ACT actions loaded using the ACT API.
bpf_tc_act_get_info is a helper to get the required attributes of a loaded program to be able to manipulate it further, by passing them into the aforementioned functions.
Example:
struct bpf_object *obj; struct bpf_program *p; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "action"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_act_attach(fd, NULL, &index); if (r < 0) return r;
if (bpf_tc_act_detach(index)) return -1;
... which is equivalent to the following sequence: tc action add action bpf obj /home/kkd/foo.o sec action tc action del action bpf index <idx>
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com --- tools/lib/bpf/libbpf.h | 118 +++++++ tools/lib/bpf/libbpf.map | 14 + tools/lib/bpf/netlink.c | 715 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 841 insertions(+), 6 deletions(-)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index a1a424b9b8ff..63baef6045b1 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -16,6 +16,9 @@ #include <stdbool.h> #include <sys/types.h> // for size_t #include <linux/bpf.h> +#include <linux/pkt_cls.h> +#include <linux/pkt_sched.h> +#include <linux/tc_act/tc_bpf.h>
#include "libbpf_common.h"
@@ -773,6 +776,121 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
+/* + * Requirements: + * If choosing hw offload mode (skip_sw = true), ifindex during prog load must be set. + */ + +/* Convenience macros for the clsact attach hooks */ +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS) +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS) + +struct bpf_tc_cls_opts { + size_t sz; + __u32 chain_index; + __u32 handle; + __u32 priority; + __u32 class_id; + bool direct_action; + bool skip_sw; + bool skip_hw; + size_t :0; +}; + +#define bpf_tc_cls_opts__last_field skip_hw + +/* Acts as a handle for an attached filter */ +struct bpf_tc_cls_attach_id { + __u32 ifindex; + union { + __u32 block_index; + __u32 parent_id; + }; + __u32 protocol; + __u32 chain_index; + __u32 handle; + __u32 priority; +}; + +struct bpf_tc_cls_info { + struct bpf_tc_cls_attach_id id; + __u32 class_id; + __u32 bpf_flags; + __u32 bpf_flags_gen; +}; + +/* id is out parameter that will be written to, it must not be NULL */ +LIBBPF_API int bpf_tc_cls_attach_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_change_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +/* This replaces an existing filter with the same attributes, so the arguments + * can be filled in from an existing attach_id when replacing, and otherwise be + * used like bpf_tc_cls_attach_dev. 
+ */ +LIBBPF_API int bpf_tc_cls_replace_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_detach_dev(const struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_get_info_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_info *info); + +/* id is out parameter that will be written to, it must not be NULL */ +LIBBPF_API int bpf_tc_cls_attach_block(int fd, __u32 block_index, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_change_block(int fd, __u32 block_index, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +/* This replaces an existing filter with the same attributes, so the arguments + * can be filled in from an existing attach_id when replacing, and otherwise be + * used like bpf_tc_cls_attach_block. 
+ */ +LIBBPF_API int bpf_tc_cls_replace_block(int fd, __u32 block_index, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_detach_block(const struct bpf_tc_cls_attach_id *id); +LIBBPF_API int bpf_tc_cls_get_info_block(int fd, __u32 block_index, + __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_info *info); + +struct bpf_tc_act_opts { + size_t sz; + __u32 index; + int action; + void *cookie; + size_t cookie_len; + __u8 hw_stats_type; + bool no_percpu; + size_t :0; +}; + +#define bpf_tc_act_opts__last_field no_percpu + +struct bpf_tc_act_info { + __u32 index; + __u32 capab; + int action; + int refcnt; + int bindcnt; +}; + +LIBBPF_API int bpf_tc_act_attach(int fd, const struct bpf_tc_act_opts *opts, __u32 *index); +LIBBPF_API int bpf_tc_act_replace(int fd, const struct bpf_tc_act_opts *opts, __u32 *index); +LIBBPF_API int bpf_tc_act_detach(__u32 index); +LIBBPF_API int bpf_tc_act_get_info(int fd, struct bpf_tc_act_info *info); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 279ae861f568..72022b45a8b9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -359,4 +359,18 @@ LIBBPF_0.4.0 { bpf_linker__finalize; bpf_linker__free; bpf_linker__new; + bpf_tc_act_attach; + bpf_tc_act_replace; + bpf_tc_act_detach; + bpf_tc_act_get_info; + bpf_tc_cls_attach_block; + bpf_tc_cls_attach_dev; + bpf_tc_cls_change_block; + bpf_tc_cls_change_dev; + bpf_tc_cls_detach_block; + bpf_tc_cls_detach_dev; + bpf_tc_cls_replace_block; + bpf_tc_cls_replace_dev; + bpf_tc_cls_get_info_dev; + bpf_tc_cls_get_info_block; } LIBBPF_0.3.0; diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index f448c29de76d..bd196d184341 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -4,8 +4,13 @@ #include <stdlib.h> #include <memory.h> #include <unistd.h> +#include <inttypes.h> +#include <arpa/inet.h> 
#include <linux/bpf.h> +#include <linux/atm.h> +#include <linux/pkt_cls.h> #include <linux/rtnetlink.h> +#include <linux/tc_act/tc_bpf.h> #include <sys/socket.h> #include <errno.h> #include <time.h> @@ -344,6 +349,20 @@ int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) return ret; }
+static int bpf_nl_get_ext(struct nlmsghdr *nh, int sock, unsigned int nl_pid, + __dump_nlmsg_t dump_link_nlmsg_p, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + int seq = time(NULL); + + nh->nlmsg_seq = seq; + if (send(sock, nh, nh->nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, dump_link_nlmsg_p, + dump_link_nlmsg, cookie); +} + int libbpf_nl_get_link(int sock, unsigned int nl_pid, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { @@ -356,12 +375,696 @@ int libbpf_nl_get_link(int sock, unsigned int nl_pid, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .ifm.ifi_family = AF_PACKET, }; - int seq = time(NULL);
- req.nlh.nlmsg_seq = seq; - if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) - return -errno; + return bpf_nl_get_ext(&req.nlh, sock, nl_pid, __dump_link_nlmsg, + dump_link_nlmsg, cookie); +}
- return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg, - dump_link_nlmsg, cookie); +static int tc_bpf_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd, + enum bpf_prog_type type) +{ + int len, ret, bpf_fd_type, bpf_name_type; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + char name[64] = {}; + + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + bpf_fd_type = TCA_BPF_FD; + bpf_name_type = TCA_BPF_NAME; + break; + case BPF_PROG_TYPE_SCHED_ACT: + bpf_fd_type = TCA_ACT_BPF_FD; + bpf_name_type = TCA_ACT_BPF_NAME; + break; + default: + return -EINVAL; + } + + ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); + if (ret < 0 || type != info.type) + return ret; + + ret = add_nlattr(nh, maxsz, bpf_fd_type, &fd, sizeof(fd)); + if (ret < 0) + return ret; + + len = snprintf(name, sizeof(name), "%s:[%" PRIu32 "]", info.name, + info.id); + if (len < 0 || len >= sizeof(name)) + return len < 0 ? -EINVAL : -ENAMETOOLONG; + + return add_nlattr(nh, maxsz, bpf_name_type, name, len + 1); +} + +struct pass_info { + void *info; + __u32 prog_id; +}; + +static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie); + +static int tc_cls_bpf_modify(int fd, int cmd, unsigned int flags, __u32 ifindex, + __u32 parent_id, __u32 protocol, + const struct bpf_tc_cls_opts *opts, + __dump_nlmsg_t fn, struct bpf_tc_cls_attach_id *id) +{ + unsigned int bpf_flags = 0, bpf_flags_gen = 0; + struct bpf_tc_cls_info info = {}; + int sock, seq = 0, ret; + struct nlattr *nla; + __u32 nl_pid = 0; + struct { + struct nlmsghdr nh; + struct tcmsg t; + char buf[256]; + } req; + + if (OPTS_GET(opts, priority, 0) > 0xFFFF) + return -EINVAL; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; + req.nh.nlmsg_type = cmd; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.t.tcm_family = 
AF_UNSPEC; + req.t.tcm_handle = OPTS_GET(opts, handle, 0); + req.t.tcm_parent = parent_id; + req.t.tcm_ifindex = ifindex; + req.t.tcm_info = + TC_H_MAKE(OPTS_GET(opts, priority, 0UL) << 16, htons(protocol)); + + if (OPTS_HAS(opts, chain_index)) { + ret = add_nlattr(&req.nh, sizeof(req), TCA_CHAIN, + &opts->chain_index, sizeof(opts->chain_index)); + if (ret < 0) + goto end; + } + + ret = add_nlattr(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + goto end; + + nla = begin_nlattr_nested(&req.nh, sizeof(req), TCA_OPTIONS); + if (!nla) { + ret = -EMSGSIZE; + goto end; + } + + if (OPTS_GET(opts, class_id, TC_H_UNSPEC)) { + ret = add_nlattr(&req.nh, sizeof(req), TCA_BPF_CLASSID, + &opts->class_id, sizeof(opts->class_id)); + if (ret < 0) + goto end; + } + + if (cmd != RTM_DELTFILTER) { + ret = tc_bpf_add_fd_and_name(&req.nh, sizeof(req), fd, + BPF_PROG_TYPE_SCHED_CLS); + if (ret < 0) + goto end; + + if (OPTS_GET(opts, skip_hw, false)) + bpf_flags_gen |= TCA_CLS_FLAGS_SKIP_HW; + if (OPTS_GET(opts, skip_sw, false)) + bpf_flags_gen |= TCA_CLS_FLAGS_SKIP_SW; + if (OPTS_GET(opts, direct_action, false)) + bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; + + if (bpf_flags_gen) { + ret = add_nlattr(&req.nh, sizeof(req), + TCA_BPF_FLAGS_GEN, &bpf_flags_gen, + sizeof(bpf_flags_gen)); + if (ret < 0) + goto end; + } + + if (bpf_flags) { + ret = add_nlattr(&req.nh, sizeof(req), TCA_BPF_FLAGS, + &bpf_flags, sizeof(bpf_flags)); + if (ret < 0) + goto end; + } + } + + end_nlattr_nested(&req.nh, nla); + + ret = send(sock, &req.nh, req.nh.nlmsg_len, 0); + if (ret < 0) + goto end; + + ret = bpf_netlink_recv(sock, nl_pid, seq, fn, NULL, + &(struct pass_info){ &info, 0 }); + + if (fn) + *id = info.id; + +end: + close(sock); + return ret; +} + +int bpf_tc_cls_attach_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + if (fd < 1 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id) + return -EINVAL; + + 
return tc_cls_bpf_modify(fd, RTM_NEWTFILTER, + NLM_F_ECHO | NLM_F_EXCL | NLM_F_CREATE, + ifindex, parent_id, protocol, opts, + cls_get_info, id); +} + +int bpf_tc_cls_change_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + if (fd < 1 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id) + return -EINVAL; + + return tc_cls_bpf_modify(fd, RTM_NEWTFILTER, NLM_F_ECHO, ifindex, + parent_id, protocol, opts, cls_get_info, id); +} + +int bpf_tc_cls_replace_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + if (fd < 1 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id) + return -EINVAL; + + return tc_cls_bpf_modify(fd, RTM_NEWTFILTER, NLM_F_ECHO | NLM_F_CREATE, + ifindex, parent_id, protocol, opts, + cls_get_info, id); +} + +int bpf_tc_cls_detach_dev(const struct bpf_tc_cls_attach_id *id) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, 0); + + if (!id) + return -EINVAL; + + opts.chain_index = id->chain_index; + opts.handle = id->handle; + opts.priority = id->priority; + + return tc_cls_bpf_modify(-1, RTM_DELTFILTER, 0, id->ifindex, + id->parent_id, id->protocol, &opts, NULL, + NULL); +} + +int bpf_tc_cls_attach_block(int fd, __u32 block_index, __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + return bpf_tc_cls_attach_dev(fd, TCM_IFINDEX_MAGIC_BLOCK, block_index, + protocol, opts, id); +} + +int bpf_tc_cls_change_block(int fd, __u32 block_index, __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + return bpf_tc_cls_attach_dev(fd, TCM_IFINDEX_MAGIC_BLOCK, block_index, + protocol, opts, id); +} + +int bpf_tc_cls_replace_block(int fd, __u32 block_index, __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_attach_id *id) +{ + return bpf_tc_cls_attach_dev(fd, TCM_IFINDEX_MAGIC_BLOCK, block_index, + protocol, opts, 
id); +} + +int bpf_tc_cls_detach_block(const struct bpf_tc_cls_attach_id *id) +{ + return bpf_tc_cls_detach_dev(id); +} + +static int __cls_get_info(void *cookie, void *msg, struct nlattr **tb) +{ + struct nlattr *tbb[TCA_BPF_MAX + 1]; + struct pass_info *cinfo = cookie; + struct bpf_tc_cls_info *info; + struct tcmsg *t = msg; + __u32 prog_id; + + info = cinfo->info; + + if (!tb[TCA_OPTIONS]) + return 0; + + libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL); + if (!tbb[TCA_BPF_ID]) + return 0; + + prog_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_ID]); + if (cinfo->prog_id && cinfo->prog_id != prog_id) + return 0; + + info->id.parent_id = t->tcm_parent; + info->id.ifindex = t->tcm_ifindex; + info->id.protocol = ntohs(TC_H_MIN(t->tcm_info)); + info->id.priority = TC_H_MAJ(t->tcm_info) >> 16; + info->id.handle = t->tcm_handle; + + if (tb[TCA_CHAIN]) + info->id.chain_index = libbpf_nla_getattr_u32(tb[TCA_CHAIN]); + else + info->id.chain_index = 0; + + if (tbb[TCA_BPF_FLAGS]) + info->bpf_flags = libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS]); + + if (tbb[TCA_BPF_FLAGS_GEN]) + info->bpf_flags_gen = + libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS_GEN]); + + if (tbb[TCA_BPF_CLASSID]) + info->class_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_CLASSID]); + + return 1; +} + +static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct tcmsg *t = NLMSG_DATA(nh); + struct nlattr *tb[TCA_MAX + 1]; + + libbpf_nla_parse(tb, TCA_MAX, + (struct nlattr *)((char *)t + NLMSG_ALIGN(sizeof(*t))), + NLMSG_PAYLOAD(nh, sizeof(*t)), NULL); + if (!tb[TCA_KIND]) + return -EINVAL; + + return __cls_get_info(cookie, t, tb); +} + +static int tc_cls_get_info(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_info *info) +{ + __u32 nl_pid, info_len = sizeof(struct bpf_prog_info); + struct bpf_prog_info prog_info = {}; + int sock, ret; + struct { + struct nlmsghdr nh; + struct tcmsg t; + char buf[256]; + } 
req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nh.nlmsg_type = RTM_GETTFILTER, + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .t.tcm_family = AF_UNSPEC, + }; + + if (!OPTS_VALID(opts, bpf_tc_cls_opts)) + return -EINVAL; + + req.t.tcm_parent = parent_id; + req.t.tcm_ifindex = ifindex; + req.t.tcm_handle = OPTS_GET(opts, handle, 0); + req.t.tcm_info = + TC_H_MAKE(OPTS_GET(opts, priority, 0UL) << 16, htons(protocol)); + + ret = bpf_obj_get_info_by_fd(fd, &prog_info, &info_len); + if (ret < 0) + return ret; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + ret = add_nlattr(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + goto end; + + if (OPTS_HAS(opts, chain_index)) { + ret = add_nlattr(&req.nh, sizeof(req), TCA_CHAIN, + &opts->chain_index, sizeof(opts->chain_index)); + if (ret < 0) + goto end; + } + + req.nh.nlmsg_seq = time(NULL); + + ret = bpf_nl_get_ext(&req.nh, sock, nl_pid, cls_get_info, NULL, + &(struct pass_info){ info, prog_info.id }); + if (ret < 0) + goto end; + /* 1 denotes a match */ + ret = ret == 1 ? 
0 : -ESRCH; +end: + close(sock); + return ret; +} + +int bpf_tc_cls_get_info_dev(int fd, __u32 ifindex, __u32 parent_id, + __u32 protocol, const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_info *info) +{ + return tc_cls_get_info(fd, ifindex, parent_id, protocol, opts, info); +} + +int bpf_tc_cls_get_info_block(int fd, __u32 block_index, __u32 protocol, + const struct bpf_tc_cls_opts *opts, + struct bpf_tc_cls_info *info) +{ + return bpf_tc_cls_get_info_dev(fd, TCM_IFINDEX_MAGIC_BLOCK, block_index, + protocol, opts, info); +} + +static int tc_act_add_action(struct nlmsghdr *nh, size_t maxsz, int type, + int fd, const struct bpf_tc_act_opts *opts) +{ + struct nlattr *nla, *nla_opt, *nla_subopt; + struct tc_act_bpf param = {}; + int ret; + + nla = begin_nlattr_nested(nh, maxsz, type); + if (!nla) + return -EMSGSIZE; + + nla_opt = begin_nlattr_nested(nh, maxsz, 1); + if (!nla_opt) + return -EMSGSIZE; + + ret = add_nlattr(nh, maxsz, TCA_ACT_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return ret; + + ret = add_nlattr(nh, maxsz, TCA_ACT_INDEX, + OPTS_HAS(opts, index) ? 
&opts->index : &(__u32){ 0 }, + sizeof(opts->index)); + + if (ret < 0) + return ret; + + nla_subopt = begin_nlattr_nested(nh, maxsz, TCA_ACT_OPTIONS); + if (!nla) + return -EMSGSIZE; + + if (fd > 0) { + ret = tc_bpf_add_fd_and_name(nh, maxsz, fd, + BPF_PROG_TYPE_SCHED_ACT); + if (ret < 0) + return ret; + } + + param.index = OPTS_GET(opts, index, 0); + param.action = OPTS_GET(opts, action, TC_ACT_UNSPEC); + + ret = add_nlattr(nh, maxsz, TCA_ACT_BPF_PARMS, ¶m, sizeof(param)); + if (ret < 0) + return ret; + + if (OPTS_GET(opts, cookie, NULL) && OPTS_GET(opts, cookie_len, 0)) { + if (opts->cookie_len > TC_COOKIE_MAX_SIZE) + return -E2BIG; + + ret = add_nlattr(nh, maxsz, TCA_ACT_COOKIE, opts->cookie, + opts->cookie_len); + if (ret < 0) + return ret; + } + + if (OPTS_GET(opts, hw_stats_type, 0)) { + struct nla_bitfield32 hw_stats_bf = { + .value = opts->hw_stats_type, + .selector = opts->hw_stats_type, + }; + + ret = add_nlattr(nh, maxsz, TCA_ACT_HW_STATS, &hw_stats_bf, + sizeof(hw_stats_bf)); + if (ret < 0) + return ret; + } + + if (OPTS_GET(opts, no_percpu, false)) { + struct nla_bitfield32 flags = { + TCA_ACT_FLAGS_NO_PERCPU_STATS, + TCA_ACT_FLAGS_NO_PERCPU_STATS, + }; + + ret = add_nlattr(nh, maxsz, TCA_ACT_FLAGS, &flags, + sizeof(flags)); + if (ret < 0) + return ret; + } + + end_nlattr_nested(nh, nla_subopt); + end_nlattr_nested(nh, nla_opt); + end_nlattr_nested(nh, nla); + + return 0; +} + +static int tc_act_modify(int cmd, unsigned int flags, int fd, int action, + const struct bpf_tc_act_opts *opts, __dump_nlmsg_t fn, + __u32 *index) +{ + struct bpf_tc_act_info info = {}; + int sock, seq = 0, ret; + __u32 nl_pid = 0; + struct { + struct nlmsghdr nh; + struct tcamsg t; + char buf[256]; + } req; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; + req.nh.nlmsg_type = cmd; + req.nh.nlmsg_pid = 0; + 
req.nh.nlmsg_seq = ++seq; + req.t.tca_family = AF_UNSPEC; + + /* gcc complains when using req.nh here */ + ret = tc_act_add_action((struct nlmsghdr *)&req, sizeof(req), + TCA_ACT_TAB, fd, opts); + if (ret < 0) + goto end; + + ret = send(sock, &req.nh, req.nh.nlmsg_len, 0); + if (ret < 0) + goto end; + + ret = bpf_netlink_recv(sock, nl_pid, seq, fn, NULL, + &(struct pass_info){ &info, 0 }); + if (ret < 0) + goto end; + + if (fn) { + if (info.index) { + *index = info.index; + ret = 0; + } else + ret = -ESRCH; + } + +end: + close(sock); + return ret; +} + +static int get_act_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie); + +int bpf_tc_act_attach(int fd, const struct bpf_tc_act_opts *opts, __u32 *index) +{ + if (fd < 1 || !OPTS_VALID(opts, bpf_tc_act_opts) || !index) + return -EINVAL; + + return tc_act_modify(RTM_NEWACTION, NLM_F_ECHO | NLM_F_EXCL, fd, + OPTS_GET(opts, action, TCA_ACT_UNSPEC), opts, + get_act_info, index); +} + +int bpf_tc_act_replace(int fd, const struct bpf_tc_act_opts *opts, __u32 *index) +{ + if (fd < 1 || !OPTS_VALID(opts, bpf_tc_act_opts) || !index) + return -EINVAL; + + return tc_act_modify(RTM_NEWACTION, NLM_F_ECHO | NLM_F_REPLACE, fd, + OPTS_GET(opts, action, TCA_ACT_UNSPEC), opts, + get_act_info, index); +} + +int bpf_tc_act_detach(__u32 index) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_act_opts, opts, .index = index); + + return tc_act_modify(RTM_DELACTION, index ? 
0 : NLM_F_ROOT, -1, + TC_ACT_UNSPEC, &opts, NULL, NULL); +} + +static int __get_act_info(void *cookie, void *msg, struct nlattr *nla) +{ + struct nlattr *tbb[TCA_ACT_BPF_MAX + 1]; + struct pass_info *ainfo = cookie; + struct bpf_tc_act_info *info; + struct tc_act_bpf parm; + __u32 prog_id; + + info = ainfo->info; + + if (!nla) + return -EINVAL; + + libbpf_nla_parse_nested(tbb, TCA_ACT_BPF_MAX, nla, NULL); + + if (!tbb[TCA_ACT_BPF_PARMS] || !tbb[TCA_ACT_BPF_ID]) + return -ESRCH; + + prog_id = libbpf_nla_getattr_u32(tbb[TCA_ACT_BPF_ID]); + if (ainfo->prog_id && ainfo->prog_id != prog_id) + return 0; + + /* Found a match */ + memcpy(&parm, libbpf_nla_data(tbb[TCA_ACT_BPF_PARMS]), + sizeof(parm)); + + info->index = parm.index; + info->capab = parm.capab; + info->action = parm.action; + info->refcnt = parm.refcnt; + info->bindcnt = parm.bindcnt; + + return 1; +} + +static int get_act_info_msg(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie, __u32 total, struct nlattr *nla) +{ + struct nlattr *tbb[TCA_ACT_MAX + 1]; + struct tcamsg *t = NLMSG_DATA(nh); + struct nlattr *tb[total + 1]; + int ret; + + libbpf_nla_parse_nested(tb, total, nla, NULL); + + for (int i = 0; i <= total; i++) { + if (tb[i]) { + nla = tb[i]; + libbpf_nla_parse_nested(tbb, TCA_ACT_MAX, nla, NULL); + + if (!tbb[TCA_ACT_KIND]) + return -EINVAL; + + ret = __get_act_info(cookie, t, tbb[TCA_ACT_OPTIONS]); + if (ret < 0) + return ret; + + if (ret > 0) + return 1; + } + } + + return 0; +} + +static int get_act_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct nlattr *nla, *tb[TCA_ROOT_MAX + 1]; + __u32 total = 0; + + nla = NLMSG_DATA(nh) + NLMSG_ALIGN(sizeof(struct tcamsg)); + libbpf_nla_parse(tb, TCA_ROOT_MAX, nla, + NLMSG_PAYLOAD(nh, sizeof(struct tcamsg)), NULL); + + if (tb[TCA_ROOT_COUNT]) + total = libbpf_nla_getattr_u32(tb[TCA_ROOT_COUNT]); + + total = total ?: TCA_ACT_MAX_PRIO; + + return get_act_info_msg(nh, fn, cookie, total, tb[TCA_ACT_TAB]); +} + +static int 
tc_act_get_info(int sock, unsigned int nl_pid, int fd, + struct bpf_tc_act_info *info) +{ + struct bpf_prog_info prog_info = {}; + __u32 info_len = sizeof(prog_info); + struct nlattr *nla, *nla_opt; + struct { + struct nlmsghdr nh; + struct tcamsg t; + char buf[256]; + } req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), + .nh.nlmsg_type = RTM_GETACTION, + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .t.tca_family = AF_UNSPEC, + }; + int ret; + + if (fd < 1) + return -EINVAL; + + ret = bpf_obj_get_info_by_fd(fd, &prog_info, &info_len); + if (ret < 0) + return ret; + + nla = begin_nlattr_nested(&req.nh, sizeof(req), TCA_ACT_TAB); + if (!nla) + return -EMSGSIZE; + + nla_opt = begin_nlattr_nested(&req.nh, sizeof(req), 1); + if (!nla_opt) + return -EMSGSIZE; + + ret = add_nlattr(&req.nh, sizeof(req), TCA_ACT_KIND, "bpf", + sizeof("bpf")); + if (ret < 0) + return ret; + + end_nlattr_nested(&req.nh, nla_opt); + end_nlattr_nested(&req.nh, nla); + + req.nh.nlmsg_seq = time(NULL); + + /* Pass prog id the info is to be returned for */ + return bpf_nl_get_ext(&req.nh, sock, nl_pid, get_act_info, NULL, + &(struct pass_info){ info, prog_info.id }); +} + +int bpf_tc_act_get_info(int fd, struct bpf_tc_act_info *info) +{ + int sock, ret; + __u32 nl_pid; + + if (fd < 1 || !info) + return -EINVAL; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + ret = tc_act_get_info(sock, nl_pid, fd, info); + if (ret < 0) + goto end; + + if (!info->index) + ret = -ESRCH; +end: + close(sock); + return ret; }
On Thu, Mar 25, 2021 at 5:02 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This adds functions that wrap the netlink API used for adding, manipulating, and removing filters and actions. These functions operate directly on the loaded prog's fd, and return a handle to the filter and action using an out parameter (id for tc_cls, and index for tc_act).
The basic featureset is covered to allow for attaching, manipulation of properties, and removal of filters and actions. Some additional features like TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can be added on top later by extending the bpf_tc_cls_opts struct.
Support for binding actions directly to a classifier by passing them in during filter creation has also been omitted for now. These actions have an auto clean up property because their lifetime is bound to the filter they are attached to. This can be added later, but was omitted for now as direct action mode is a better alternative to it.
An API summary:
The BPF TC-CLS API
bpf_tc_cls_{attach, change, replace}_{dev, block} may be used to attach, change, and replace SCHED_CLS bpf classifiers. Separate sets of functions are provided for network interfaces and shared filter blocks.
bpf_tc_cls_detach_{dev, block} may be used to detach an existing SCHED_CLS filter. The bpf_tc_cls_attach_id object filled in during attach, change, or replace must be passed in to the detach functions for them to remove the filter and its attached classifier correctly.
bpf_tc_cls_get_info is a helper that can be used to obtain attributes for the filter and classifier. The opts structure may be used to choose the granularity of search, such that info for a specific filter corresponding to the same loaded bpf program can be obtained. By default, the first match is returned to the user.
Examples:
struct bpf_tc_cls_attach_id id = {}; struct bpf_object *obj; struct bpf_program *p; int fd, r; obj = bpf_object__open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj); p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p); if (bpf_object__load(obj) < 0) return -1; fd = bpf_program__fd(p); r = bpf_tc_cls_attach_dev(fd, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, NULL, &id); if (r < 0) return r;
... which is roughly equivalent to (after clsact qdisc setup): # tc filter add dev lo ingress bpf obj /home/kkd/foo.o sec classifier
If a user wishes to modify existing options on an attached filter, the bpf_tc_cls_change_{dev, block} API may be used. Parameters like chain_index, priority, and handle are ignored in the bpf_tc_cls_opts struct as they cannot be modified after attaching a filter.
Example:
/* Optional parameters necessary to select the right filter */ DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = id.handle, .priority = id.priority, .chain_index = id.chain_index); /* Turn on direct action mode */ opts.direct_action = true; r = bpf_tc_cls_change_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &id); if (r < 0) return r; /* Verify that the direct action mode has been set */ struct bpf_tc_cls_info info = {}; r = bpf_tc_cls_get_info_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &info); if (r < 0) return r; assert(info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
This would be roughly equivalent to doing: # tc filter change dev lo ingress prio <p> handle <h> bpf obj /home/kkd/foo.o section classifier da
... except a new bpf program will be loaded and replace existing one.
If a user wishes to either replace an existing filter, or create a new one with the same properties, they can use bpf_tc_cls_replace_dev. The benefit of bpf_tc_cls_change is that it fails if no matching filter exists.
The BPF TC-ACT API
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
bpf_tc_act_{attach, replace} may be used to attach SCHED_ACT actions and to replace already attached ones. Passing an index of 0 has special meaning, in that an index will be automatically chosen by the kernel. On success, the index chosen by the kernel is written to the index out parameter of these functions.
bpf_tc_act_detach may be used to detach a SCHED_ACT action prog identified by the index parameter. The index 0 again has a special meaning, in that passing it will flush all existing SCHED_ACT actions loaded using the ACT API.
bpf_tc_act_get_info is a helper to get the required attributes of a loaded program to be able to manipulate it further, by passing them into the aforementioned functions.
[...]
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
-- Kartikeya
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
But even if not, I think detaching can be unified between _dev and _block, can't it?
-- Kartikeya
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
Hmm, that's not a bad idea, actually. An earlier version of the series did have only a single set of functions, but with way too many arguments, which is why we ended up agreeing to split them. But encapsulating the destination in a separate struct and combining it with some helper macros might just make this work! I like it! Kumar, WDYT?
-Toke
On Wed, Mar 31, 2021 at 02:41:40AM IST, Toke Høiland-Jørgensen wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
Hmm, that's not a bad idea, actually. An earlier version of the series did have only a single set of functions, but with way too many arguments, which is why we ended up agreeing to split them. But encapsulating the destination in a separate struct and combining it with some helper macros might just make this work! I like it! Kumar, WDYT?
SGTM.
-Toke
-- Kartikeya
On 3/30/21 10:39 PM, Andrii Nakryiko wrote:
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
But even if not, I think detaching can be unified between _dev and _block, can't it?
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
Thanks, Daniel
On Tue, Mar 30, 2021 at 2:26 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 3/30/21 10:39 PM, Andrii Nakryiko wrote:
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
But even if not, I think detaching can be unified between _dev and _block, can't it?
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
+1 to support sched_cls in direct-action mode only.
On Wed, Mar 31, 2021 at 02:55:47AM IST, Daniel Borkmann wrote:
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
The block variant can be dropped, I'll use the TC_BLOCK/TC_DEV alternative which sets parent_id/ifindex properly.
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
I am not opposed to clsact qdisc setup if INGRESS/EGRESS is supplied (not sure how others feel about it).
We could make direct_action mode default, and similarly choose prio as 1 by default instead of letting the kernel do it. Then you can just pass in NULL for bpf_tc_cls_opts and be close to what you're proposing. For protocol we can choose ETH_P_ALL by default too if the user doesn't set it.
With these modifications, the equivalent would look like bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, INGRESS), NULL, &id);
So as long as the user doesn't care about other details, they can just pass opts as NULL.
WDYT?
Thanks, Daniel
-- Kartikeya
On 3/31/21 11:44 AM, Kumar Kartikeya Dwivedi wrote:
On Wed, Mar 31, 2021 at 02:55:47AM IST, Daniel Borkmann wrote:
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
The block variant can be dropped, I'll use the TC_BLOCK/TC_DEV alternative which sets parent_id/ifindex properly.
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
I am not opposed to clsact qdisc setup if INGRESS/EGRESS is supplied (not sure how others feel about it).
What speaks against it? It would be 100% clear from API side where the prog is being attached. Same as with tc cmdline where you specify 'ingress'/'egress'.
We could make direct_action mode default, and similarly choose prio
To be honest, I wouldn't even support a mode from the lib/API side where direct_action is not set. It should always be forced to true. Everything else is rather broken setup-wise, imho, since it won't scale. We added direct_action a bit later to the kernel than original cls_bpf, but if I would do it again today, I'd make it the only available option. I don't see a reasonable use-case where you have it to false.
as 1 by default instead of letting the kernel do it. Then you can just pass in NULL for bpf_tc_cls_opts and be close to what you're proposing. For protocol we can choose ETH_P_ALL by default too if the user doesn't set it.
Same here with ETH_P_ALL, I'm not sure anyone uses anything other than ETH_P_ALL, so yes, that should be default.
With these modifications, the equivalent would look like bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, INGRESS), NULL, &id);
Few things compared to bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}):
1) nit, but why even 'cls' in the name. I think we shouldn't expose such old-days tc semantics to a user. Just bpf_tc_attach() is cleaner/simpler to understand. 2) What's the 'TC_DEV(ifindex, INGRESS)' macro doing exactly? Looks unnecessary, why not regular args to the API? 3) Exposed bpf_tc_attach() API could internally call a bpf_tc_attach_opts() API with preset defaults, and the latter could have all the custom bits if the user needs to go beyond the simple API, so from your bpf_tc_cls_attach() I'd also drop the NULL. 4) For the simple API I'd likely also drop the id (you could have a query API if needed).
So as long as the user doesn't care about other details, they can just pass opts as NULL.
On Fri, Apr 02, 2021 at 05:49:29AM IST, Daniel Borkmann wrote:
On 3/31/21 11:44 AM, Kumar Kartikeya Dwivedi wrote:
On Wed, Mar 31, 2021 at 02:55:47AM IST, Daniel Borkmann wrote:
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
The block variant can be dropped, I'll use the TC_BLOCK/TC_DEV alternative which sets parent_id/ifindex properly.
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
I am not opposed to clsact qdisc setup if INGRESS/EGRESS is supplied (not sure how others feel about it).
What speaks against it? It would be 100% clear from API side where the prog is being attached. Same as with tc cmdline where you specify 'ingress'/'egress'.
Ok, I will add the qdisc setup in the next revision.
We could make direct_action mode default, and similarly choose prio
To be honest, I wouldn't even support a mode from the lib/API side where direct_action is not set. It should always be forced to true. Everything else is rather broken setup-wise, imho, since it won't scale. We added direct_action a bit later to the kernel than original cls_bpf, but if I would do it again today, I'd make it the only available option. I don't see a reasonable use-case where you have it to false.
I'm all for doing that, but in some sense that also speaks against SCHED_ACT support. Currently, you can load SCHED_ACT programs using this series, but not really bind them to classifier. I left that option open to a future patch, it would just reuse the existing tc_act_add_action helper (also why I kept it in its own function). Maybe we need to reconsider that, if direct action is the only recommended way going forward (to discourage people from using SCHED_ACT), or just add opts to do all the setup in low level API, instead of leaving it incomplete.
as 1 by default instead of letting the kernel do it. Then you can just pass in NULL for bpf_tc_cls_opts and be close to what you're proposing. For protocol we can choose ETH_P_ALL by default too if the user doesn't set it.
Same here with ETH_P_ALL, I'm not sure anyone uses anything other than ETH_P_ALL, so yes, that should be default.
Ack.
With these modifications, the equivalent would look like bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, INGRESS), NULL, &id);
Few things compared to bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}):
- nit, but why even 'cls' in the name. I think we shouldn't expose such old-days tc semantics to a user. Just bpf_tc_attach() is cleaner/simpler to understand.
Since it would make it clear this is for SCHED_CLS progs, likewise bpf_tc_act_* is for SCHED_ACT progs. Not opposed to changing the name.
- What's the 'TC_DEV(ifindex, INGRESS)' macro doing exactly? Looks unnecessary, why not regular args to the API?
It is very easy to support BLOCK (I know it's not really popular here, but I think if supporting it just requires adding a macro, then we can go ahead). So the user can use TC_BLOCK(block_idx) instead of remembering ifindex is to be set to TCM_IFINDEX_MAGIC_BLOCK and parent_id to actual block index. It will just expand to:
#define TC_BLOCK(block_idx) TCM_IFINDEX_MAGIC_BLOCK, (block_idx)
TC_DEV macro can be dropped, since user can directly pass ifindex and parent_id.
- Exposed bpf_tc_attach() API could internally call a bpf_tc_attach_opts() API with preset defaults, and the latter could have all the custom bits if the user needs to go beyond the simple API, so from your bpf_tc_cls_attach() I'd also drop the NULL.
Ok, this is probably better (but maybe we can do this for the high-level bpf_program__attach that returns a bpf_link * instead of introducing yet another function).
- For the simple API I'd likely also drop the id (you could have a query API if needed).
This would be fine, because it's not a fast path or anything, but right now we return the id using the netlink response, otherwise for query we have to open the socket, prepare the msg, send and recv again. So it's a minor optimization.
However, there's one other problem. In an earlier version of this series, I didn't keep the id/index out parameters (to act as handle to the newly attached filter/action). This led to problems on query. Suppose a user doesn't properly fill the opts during query (e.g. in case of filters). This means the netlink dump includes all filters matching filled in attributes. If the prog_id for all of these is same (e.g. all have same bpf classifier prog attached to them), it becomes impossible to determine which one is the filter user asked for. It is not possible to enforce filling in all kinds of attributes since some can be left out and assigned by default in the kernel (priority, chain_index etc.). So returning the newly created filter's id turned out to be the best option. This is also used to stash filter related information in bpf_link to properly release it later.
The same problem happens with actions, where we look up using the prog_id, as multiple actions with different indexes can match on the same prog_id. It is not possible to determine which index corresponds to last loaded action.
So unless there's a better idea on how to deal with this, a query API won't work for the case where same bpf prog is attached more than once. Returning the id/index during attach seemed better than all other options we considered.
-- Kartikeya
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This would be fine, because it's not a fast path or anything, but right now we return the id using the netlink response, otherwise for query we have to open the socket, prepare the msg, send and recv again. So it's a minor optimization.
However, there's one other problem. In an earlier version of this series, I didn't keep the id/index out parameters (to act as handle to the newly attached filter/action). This led to problems on query. Suppose a user doesn't properly fill the opts during query (e.g. in case of filters). This means the netlink dump includes all filters matching filled in attributes. If the prog_id for all of these is same (e.g. all have same bpf classifier prog attached to them), it becomes impossible to determine which one is the filter user asked for. It is not possible to enforce filling in all kinds of attributes since some can be left out and assigned by default in the kernel (priority, chain_index etc.). So returning the newly created filter's id turned out to be the best option. This is also used to stash filter related information in bpf_link to properly release it later.
The same problem happens with actions, where we look up using the prog_id, as multiple actions with different indexes can match on the same prog_id. It is not possible to determine which index corresponds to last loaded action.
So unless there's a better idea on how to deal with this, a query API won't work for the case where same bpf prog is attached more than once. Returning the id/index during attach seemed better than all other options we considered.
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious. So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd). The user processes will not conflict with each other and will not accidentally detach bpf program that was attached by another user process. Such api will address the existing tc query/attach/detach race conditions. And libbpf side of support for this api will be trivial. Single bpf link_create command with ifindex and ingress|egress arguments. wdyt?
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
[...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
The user processes will not conflict with each other and will not accidentally detach bpf program that was attached by another user process. Such api will address the existing tc query/attach/detach race conditions.
Hmm, I think we do solve the race condition by returning the id. As long as you don't misuse the interface and go around deleting filters arbitrarily (i.e. only detach using the id), programs won't step over each other's filters. Storing the id from the netlink response received during detach also eliminates any ambiguity from probing through get_info after attach. Same goes for actions, and the same applies to the bpf_link returning API (which stashes id/index).
Do you have any other example that can still be racy given the current API?
The only advantage of fd would be the possibility of pinning it, and extending lifetime of the filter.
And libbpf side of support for this api will be trivial. Single bpf link_create command with ifindex and ingress|egress arguments. wdyt?
-- Kartikeya
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
[...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach to the same spot a different prog and now bpf_link__destroy will be detaching somebody else's prog.
So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
The user processes will not conflict with each other and will not accidentally detach bpf program that was attached by another user process. Such api will address the existing tc query/attach/detach race conditions.
Hmm, I think we do solve the race condition by returning the id. As long as you don't misuse the interface and go around deleting filters arbitrarily (i.e. only detach using the id), programs won't step over each other's filters. Storing the id from the netlink response received during detach also eliminates any ambiguity from probing through get_info after attach. Same goes for actions, and the same applies to the bpf_link returning API (which stashes id/index).
There are plenty of tools and libraries out there that do attach/detach of bpf to tc. Everyone is not going to convert to this new libbpf api overnight. So 'misuse of the interface' is not a misuse. It's a reality that is going to keep happening unless the kernel guarantees ownership of the attachment via FD.
The only advantage of fd would be the possibility of pinning it, and extending lifetime of the filter.
Pinning is one of the advantages. The main selling point of FD is ownership of the attachment.
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
[...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach to the same spot a different prog and now bpf_link__destroy will be detaching somebody else's prog.
So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match how bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
The user processes will not conflict with each other and will not accidentally detach bpf program that was attached by another user process. Such api will address the existing tc query/attach/detach race conditions.
Hmm, I think we do solve the race condition by returning the id. As long as you don't misuse the interface and go around deleting filters arbitrarily (i.e. only detach using the id), programs won't step over each other's filters. Storing the id from the netlink response received during detach also eliminates any ambiguity from probing through get_info after attach. Same goes for actions, and the same applies to the bpf_link returning API (which stashes id/index).
There are plenty of tools and libraries out there that do attach/detach of bpf to tc. Everyone is not going to convert to this new libbpf api overnight. So 'misuse of the interface' is not a misuse. It's a reality that is going to keep happening unless the kernel guarantees ownership of the attachment via FD.
The only advantage of fd would be the possibility of pinning it, and extending lifetime of the filter.
Pinning is one of the advantages. The main selling point of FD is ownership of the attachment.
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
[...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach to the same spot a different prog and now bpf_link__destroy will be detaching somebody else's prog.
So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match how bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
-Toke
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
[...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach to the same spot a different prog and now bpf_link__destroy will be detaching somebody else's prog.
So I would like to propose to take this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): and make this proposed api to return FD. To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match how bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
I think that's the best and least controversial step forward for getting this API into libbpf.
-Toke
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote:
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: > [...]
All of these things are messy because of tc legacy. bpf tried to follow tc style with cls and act distinction and it didn't quite work. cls with direct-action is the only thing that became mainstream while tc style attach wasn't really addressed. There were several incidents where tc had tens of thousands of progs attached because of this attach/query/index weirdness described above. I think the only way to address this properly is to introduce bpf_link style of attaching to tc. Such bpf_link would support ingress/egress only. direct-action will be implied. There won't be any index and query will be obvious.
Note that we already have bpf_link support working (without support for pinning, of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach a different prog to the same spot, and now bpf_link__destroy will be detaching somebody else's prog.
So I would like to propose taking this patch set a step further from what Daniel said: int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}), and make this proposed API return an FD. To detach from tc ingress/egress, just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match what the bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
-Toke
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote:
On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: > On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: > > [...] > > All of these things are messy because of tc legacy. bpf tried to follow tc style > with cls and act distinction and it didn't quite work. cls with > direct-action is the only > thing that became mainstream while tc style attach wasn't really addressed. > There were several incidents where tc had tens of thousands of progs attached > because of this attach/query/index weirdness described above. > I think the only way to address this properly is to introduce bpf_link style of > attaching to tc. Such bpf_link would support ingress/egress only. > direction-action will be implied. There won't be any index and query > will be obvious.
Note that we already have bpf_link support working (without support for pinning, of course) in a limited way. The ifindex, protocol, parent_id, priority, handle, chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach a different prog to the same spot, and now bpf_link__destroy will be detaching somebody else's prog.
> So I would like to propose to take this patch set a step further from > what Daniel said: > int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): > and make this proposed api to return FD. > To detach from tc ingress/egress just close(fd).
You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match what the bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not. There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
-Toke
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: > On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: > > On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: > > > [...] > > > > All of these things are messy because of tc legacy. bpf tried to follow tc style > > with cls and act distinction and it didn't quite work. cls with > > direct-action is the only > > thing that became mainstream while tc style attach wasn't really addressed. > > There were several incidents where tc had tens of thousands of progs attached > > because of this attach/query/index weirdness described above. > > I think the only way to address this properly is to introduce bpf_link style of > > attaching to tc. Such bpf_link would support ingress/egress only. > > direction-action will be implied. There won't be any index and query > > will be obvious. > > Note that we already have bpf_link support working (without support for pinning > ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, > chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link > and are able to operate on the exact filter during release.
Except they're not unique. The library can stash them, but something else doing detach via iproute2 or their own netlink calls will detach the prog. This other app can attach a different prog to the same spot, and now bpf_link__destroy will be detaching somebody else's prog.
> > So I would like to propose to take this patch set a step further from > > what Daniel said: > > int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): > > and make this proposed api to return FD. > > To detach from tc ingress/egress just close(fd). > > You mean adding an fd-based TC API to the kernel?
yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match what the bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I'm fine with adding that; I just want the functions implementing the TC API to also be exported so users can use those if they prefer...
-Toke
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote: > > On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: > > On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: > > > On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: > > > > [...] > > > > > > All of these things are messy because of tc legacy. bpf tried to follow tc style > > > with cls and act distinction and it didn't quite work. cls with > > > direct-action is the only > > > thing that became mainstream while tc style attach wasn't really addressed. > > > There were several incidents where tc had tens of thousands of progs attached > > > because of this attach/query/index weirdness described above. > > > I think the only way to address this properly is to introduce bpf_link style of > > > attaching to tc. Such bpf_link would support ingress/egress only. > > > direction-action will be implied. There won't be any index and query > > > will be obvious. > > > > Note that we already have bpf_link support working (without support for pinning > > ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, > > chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link > > and are able to operate on the exact filter during release. > > Except they're not unique. The library can stash them, but something else > doing detach via iproute2 or their own netlink calls will detach the prog. > This other app can attach to the same spot a different prog and now > bpf_link__destroy will be detaching somebody else prog. > > > > So I would like to propose to take this patch set a step further from > > > what Daniel said: > > > int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): > > > and make this proposed api to return FD. > > > To detach from tc ingress/egress just close(fd). > > > > You mean adding an fd-based TC API to the kernel? > > yes.
I'm totally for bpf_link-based TC attachment.
But I think *also* having "legacy" netlink-based APIs will allow applications to handle older kernels in a much nicer way without extra dependency on iproute2. We have a similar situation with kprobe, where currently libbpf only supports "modern" fd-based attachment, but users periodically ask questions and struggle to figure out issues on older kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
So I think we'd have to support legacy TC APIs, but I agree with Alexei and Daniel that we should keep it to the simplest and most straightforward API of supporting direct-action attachments and setting up qdisc transparently (if I'm getting all the terminology right, after reading Quentin's blog post). That coincidentally should probably match what the bpf_link-based TC API will look like, so all that can be abstracted behind a single bpf_link__attach_tc() API as well, right? That's the plan for dealing with kprobe right now, btw. Libbpf will detect the best available API and transparently fall back (maybe with some warning for awareness, due to inherent downsides of legacy APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about the lifetime of the BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
So I guess call me unconvinced (yet? still?). Give it another shot, though.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
I'm not intending to allow legacy kprobe APIs to be "chosen", for instance. Because I'm convinced it's a bad API that no one should use if they can use an FD-based one. It might be a different case for TC, who knows. I'd just start with safer APIs and then evaluate whether there is a real demand for less safe ones. It's just some minor refactoring and exposing more APIs, when/if we need them.
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I'm fine with adding that; I just want the functions implementing the TC API to also be exported so users can use those if they prefer...
-Toke
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes: > On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov > alexei.starovoitov@gmail.com wrote: >> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>> [...] >>>> >>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>> with cls and act distinction and it didn't quite work. cls with >>>> direct-action is the only >>>> thing that became mainstream while tc style attach wasn't really addressed. >>>> There were several incidents where tc had tens of thousands of progs attached >>>> because of this attach/query/index weirdness described above. >>>> I think the only way to address this properly is to introduce bpf_link style of >>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>> direction-action will be implied. There won't be any index and query >>>> will be obvious. >>> >>> Note that we already have bpf_link support working (without support for pinning >>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>> and are able to operate on the exact filter during release. >> >> Except they're not unique. The library can stash them, but something else >> doing detach via iproute2 or their own netlink calls will detach the prog. >> This other app can attach to the same spot a different prog and now >> bpf_link__destroy will be detaching somebody else prog. >> >>>> So I would like to propose to take this patch set a step further from >>>> what Daniel said: >>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>> and make this proposed api to return FD. >>>> To detach from tc ingress/egress just close(fd). >>> >>> You mean adding an fd-based TC API to the kernel? >> >> yes. 
> > I'm totally for bpf_link-based TC attachment. > > But I think *also* having "legacy" netlink-based APIs will allow > applications to handle older kernels in a much nicer way without extra > dependency on iproute2. We have a similar situation with kprobe, where > currently libbpf only supports "modern" fd-based attachment, but users > periodically ask questions and struggle to figure out issues on older > kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
> So I think we'd have to support legacy TC APIs, but I agree with > Alexei and Daniel that we should keep it to the simplest and most > straightforward API of supporting direction-action attachments and > setting up qdisc transparently (if I'm getting all the terminology > right, after reading Quentin's blog post). That coincidentally should > probably match how bpf_link-based TC API will look like, so all that > can be abstracted behind a single bpf_link__attach_tc() API as well, > right? That's the plan for dealing with kprobe right now, btw. Libbpf > will detect the best available API and transparently fall back (maybe > with some warning for awareness, due to inherent downsides of legacy > APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about the lifetime of the BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I think the problem here, though, is that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' API that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
So I guess call me unconvinced (yet? still?). Give it another shot, though.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
I'm not intending to allow legacy kprobe APIs to be "chosen", for instance. Because I'm convinced it's a bad API that no one should use if they can use an FD-based one. It might be a different case for TC, who knows. I'd just start with safer APIs and then evaluate whether there is a real demand for less safe ones. It's just some minor refactoring and exposing more APIs, when/if we need them.
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I'm fine with adding that; I just want the functions implementing the TC API to also be exported so users can use those if they prefer...
-Toke
On Wed, Apr 14, 2021 at 4:32 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: > Andrii Nakryiko andrii.nakryiko@gmail.com writes: >> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov >> alexei.starovoitov@gmail.com wrote: >>> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>>> [...] >>>>> >>>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>>> with cls and act distinction and it didn't quite work. cls with >>>>> direct-action is the only >>>>> thing that became mainstream while tc style attach wasn't really addressed. >>>>> There were several incidents where tc had tens of thousands of progs attached >>>>> because of this attach/query/index weirdness described above. >>>>> I think the only way to address this properly is to introduce bpf_link style of >>>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>>> direction-action will be implied. There won't be any index and query >>>>> will be obvious. >>>> >>>> Note that we already have bpf_link support working (without support for pinning >>>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>>> and are able to operate on the exact filter during release. >>> >>> Except they're not unique. The library can stash them, but something else >>> doing detach via iproute2 or their own netlink calls will detach the prog. >>> This other app can attach to the same spot a different prog and now >>> bpf_link__destroy will be detaching somebody else prog. >>> >>>>> So I would like to propose to take this patch set a step further from >>>>> what Daniel said: >>>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>>> and make this proposed api to return FD. 
>>>>> To detach from tc ingress/egress just close(fd). >>>> >>>> You mean adding an fd-based TC API to the kernel? >>> >>> yes. >> >> I'm totally for bpf_link-based TC attachment. >> >> But I think *also* having "legacy" netlink-based APIs will allow >> applications to handle older kernels in a much nicer way without extra >> dependency on iproute2. We have a similar situation with kprobe, where >> currently libbpf only supports "modern" fd-based attachment, but users >> periodically ask questions and struggle to figure out issues on older >> kernels that don't support new APIs. > > +1; I am OK with adding a new bpf_link-based way to attach TC programs, > but we still need to support the netlink API in libbpf. > >> So I think we'd have to support legacy TC APIs, but I agree with >> Alexei and Daniel that we should keep it to the simplest and most >> straightforward API of supporting direction-action attachments and >> setting up qdisc transparently (if I'm getting all the terminology >> right, after reading Quentin's blog post). That coincidentally should >> probably match how bpf_link-based TC API will look like, so all that >> can be abstracted behind a single bpf_link__attach_tc() API as well, >> right? That's the plan for dealing with kprobe right now, btw. Libbpf >> will detect the best available API and transparently fall back (maybe >> with some warning for awareness, due to inherent downsides of legacy >> APIs: no auto-cleanup being the most prominent one). > > Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the > high-level API auto-detect. That way users can also still use the > netlink attach function if they don't want the fd-based auto-close > behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only higher-level TC BPF API behind bpf_link. It will keep the API complexity and amount of APIs that libbpf will have to support to the minimum, and will keep the API itself simple: direct-attach with the minimum amount of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I think the problem here is though that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' api that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
What about pinning? It's not exactly the same, but bpf_link could actually pin a BPF program, if using legacy TC, and pin bpf_link, if using bpf_link-based APIs. Of course, before switching from iproute2 to libbpf APIs you'd need to design your applications to use pinning instead of relying implicitly on a permanently attached BPF program.
So I guess call me unconvinced (yet? still?). Give it another shot, though.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
I'm not intending to allow legacy kprobe APIs to be "chosen", for instance. Because I'm convinced it's a bad API that no one should use if they can use an FD-based one. It might be a different case for TC, who knows. I'd just start with safer APIs and then evaluate whether there is a real demand for less safe ones. It's just some minor refactoring and exposing more APIs, when/if we need them.
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I'm fine with adding that; I just want the functions implementing the TC API to also be exported so users can use those if they prefer...
-Toke
On 4/15/21 1:58 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 4:32 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes: > On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov >>> alexei.starovoitov@gmail.com wrote: >>>> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>>>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>>>> [...] >>>>>> >>>>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>>>> with cls and act distinction and it didn't quite work. cls with >>>>>> direct-action is the only >>>>>> thing that became mainstream while tc style attach wasn't really addressed. >>>>>> There were several incidents where tc had tens of thousands of progs attached >>>>>> because of this attach/query/index weirdness described above. >>>>>> I think the only way to address this properly is to introduce bpf_link style of >>>>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>>>> direction-action will be implied. There won't be any index and query >>>>>> will be obvious. >>>>> >>>>> Note that we already have bpf_link support working (without support for pinning >>>>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>>>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>>>> and are able to operate on the exact filter during release. >>>> >>>> Except they're not unique. The library can stash them, but something else >>>> doing detach via iproute2 or their own netlink calls will detach the prog. >>>> This other app can attach to the same spot a different prog and now >>>> bpf_link__destroy will be detaching somebody else prog. 
>>>> >>>>>> So I would like to propose to take this patch set a step further from >>>>>> what Daniel said: >>>>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>>>> and make this proposed api to return FD. >>>>>> To detach from tc ingress/egress just close(fd). >>>>> >>>>> You mean adding an fd-based TC API to the kernel? >>>> >>>> yes. >>> >>> I'm totally for bpf_link-based TC attachment. >>> >>> But I think *also* having "legacy" netlink-based APIs will allow >>> applications to handle older kernels in a much nicer way without extra >>> dependency on iproute2. We have a similar situation with kprobe, where >>> currently libbpf only supports "modern" fd-based attachment, but users >>> periodically ask questions and struggle to figure out issues on older >>> kernels that don't support new APIs. >> >> +1; I am OK with adding a new bpf_link-based way to attach TC programs, >> but we still need to support the netlink API in libbpf. >> >>> So I think we'd have to support legacy TC APIs, but I agree with >>> Alexei and Daniel that we should keep it to the simplest and most >>> straightforward API of supporting direction-action attachments and >>> setting up qdisc transparently (if I'm getting all the terminology >>> right, after reading Quentin's blog post). That coincidentally should >>> probably match how bpf_link-based TC API will look like, so all that >>> can be abstracted behind a single bpf_link__attach_tc() API as well, >>> right? That's the plan for dealing with kprobe right now, btw. Libbpf >>> will detect the best available API and transparently fall back (maybe >>> with some warning for awareness, due to inherent downsides of legacy >>> APIs: no auto-cleanup being the most prominent one). >> >> Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the >> high-level API auto-detect. That way users can also still use the >> netlink attach function if they don't want the fd-based auto-close >> behaviour of bpf_link. 
> > So I thought a bit more about this, and it feels like the right move > would be to expose only higher-level TC BPF API behind bpf_link. It > will keep the API complexity and amount of APIs that libbpf will have > to support to the minimum, and will keep the API itself simple: > direct-attach with the minimum amount of input arguments. By not > exposing low-level APIs we also table the whole bpf_tc_cls_attach_id > design discussion, as we now can keep as much info as needed inside > bpf_link_tc (which will embed bpf_link internally as well) to support > detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I think the problem here is though that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' api that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
What about pinning? It's not exactly the same, but bpf_link could actually pin a BPF program, if using legacy TC, and pin bpf_link, if using bpf_link-based APIs. Of course, before switching from iproute2 to libbpf APIs you'd need to design your applications to use pinning instead of relying implicitly on a permanently attached BPF program.
With all the progs we load from Cilium in a K8s setting w/ Pods, we could easily have over 100 loaded at the same time on a node, and we template the per-Pod ones, so the complexity of managing those pinned lifecycles from the agent and dealing with the semantic/fallback differences between kernels probably doesn't feel worth the gain. So if there were a libbpf tc simplified attach API, I'd for the time being stick to the existing, aka legacy, means.
Thanks, Daniel
On Thu, Apr 15, 2021 at 3:10 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:58 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 4:32 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote: > Andrii Nakryiko andrii.nakryiko@gmail.com writes: >> On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >>> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>>> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov >>>> alexei.starovoitov@gmail.com wrote: >>>>> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>>>>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>>>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>>>>> [...] >>>>>>> >>>>>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>>>>> with cls and act distinction and it didn't quite work. cls with >>>>>>> direct-action is the only >>>>>>> thing that became mainstream while tc style attach wasn't really addressed. >>>>>>> There were several incidents where tc had tens of thousands of progs attached >>>>>>> because of this attach/query/index weirdness described above. >>>>>>> I think the only way to address this properly is to introduce bpf_link style of >>>>>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>>>>> direction-action will be implied. There won't be any index and query >>>>>>> will be obvious. >>>>>> >>>>>> Note that we already have bpf_link support working (without support for pinning >>>>>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>>>>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>>>>> and are able to operate on the exact filter during release. >>>>> >>>>> Except they're not unique. The library can stash them, but something else >>>>> doing detach via iproute2 or their own netlink calls will detach the prog. >>>>> This other app can attach to the same spot a different prog and now >>>>> bpf_link__destroy will be detaching somebody else prog. 
>>>>> >>>>>>> So I would like to propose to take this patch set a step further from >>>>>>> what Daniel said: >>>>>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>>>>> and make this proposed api to return FD. >>>>>>> To detach from tc ingress/egress just close(fd). >>>>>> >>>>>> You mean adding an fd-based TC API to the kernel? >>>>> >>>>> yes. >>>> >>>> I'm totally for bpf_link-based TC attachment. >>>> >>>> But I think *also* having "legacy" netlink-based APIs will allow >>>> applications to handle older kernels in a much nicer way without extra >>>> dependency on iproute2. We have a similar situation with kprobe, where >>>> currently libbpf only supports "modern" fd-based attachment, but users >>>> periodically ask questions and struggle to figure out issues on older >>>> kernels that don't support new APIs. >>> >>> +1; I am OK with adding a new bpf_link-based way to attach TC programs, >>> but we still need to support the netlink API in libbpf. >>> >>>> So I think we'd have to support legacy TC APIs, but I agree with >>>> Alexei and Daniel that we should keep it to the simplest and most >>>> straightforward API of supporting direction-action attachments and >>>> setting up qdisc transparently (if I'm getting all the terminology >>>> right, after reading Quentin's blog post). That coincidentally should >>>> probably match how bpf_link-based TC API will look like, so all that >>>> can be abstracted behind a single bpf_link__attach_tc() API as well, >>>> right? That's the plan for dealing with kprobe right now, btw. Libbpf >>>> will detect the best available API and transparently fall back (maybe >>>> with some warning for awareness, due to inherent downsides of legacy >>>> APIs: no auto-cleanup being the most prominent one). >>> >>> Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the >>> high-level API auto-detect. 
That way users can also still use the >>> netlink attach function if they don't want the fd-based auto-close >>> behaviour of bpf_link. >> >> So I thought a bit more about this, and it feels like the right move >> would be to expose only higher-level TC BPF API behind bpf_link. It >> will keep the API complexity and amount of APIs that libbpf will have >> to support to the minimum, and will keep the API itself simple: >> direct-attach with the minimum amount of input arguments. By not >> exposing low-level APIs we also table the whole bpf_tc_cls_attach_id >> design discussion, as we now can keep as much info as needed inside >> bpf_link_tc (which will embed bpf_link internally as well) to support >> detachment and possibly some additional querying, if needed. > > But then there would be no way for the caller to explicitly select a > mechanism? I.e., if I write a BPF program using this mechanism targeting > a 5.12 kernel, I'll get netlink attachment, which can stick around when > I do bpf_link__disconnect(). But then if the kernel gets upgraded to > support bpf_link for TC programs I'll suddenly transparently get > bpf_link and the attachments will go away unless I pin them. This > seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
There are many ways to skin this cat. I'd prioritize bpf_link-based TC APIs to be added with legacy TC API as a fallback.
I think the problem here is though that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' api that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
What about pinning? It's not exactly the same, but bpf_link could actually pin a BPF program, if using legacy TC, and pin bpf_link, if using bpf_link-based APIs. Of course, before switching from iproute2 to libbpf APIs you'd need to design your applications to use pinning instead of relying implicitly on a permanently attached BPF program.
With all the progs we load from Cilium in a K8s setting w/ Pods, we could easily have over 100 loaded at the same time on a node, and we template the per-Pod ones, so the complexity of managing those pinned lifecycles from the agent and dealing with the semantic/fallback differences between kernels probably doesn't feel worth the gain. So if there were a libbpf tc simplified attach API, I'd for the time being stick to the existing, aka legacy, means.
Sure. Then what do you think about keeping only the low-level TC APIs, and in the future adding bpf_program__attach_tc(), which will use the bpf_link-based one? It seems like it's not worth it to pretend we have bpf_link-based semantics with the "legacy" current TC APIs. This is similar to how we have a low-level XDP attach API, and a bpf_link-based (only) bpf_program__attach_xdp().
Thanks, Daniel
On 4/16/21 12:22 AM, Andrii Nakryiko wrote:
On Thu, Apr 15, 2021 at 3:10 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:58 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 4:32 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes: > On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>> On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >>>> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>>>> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov >>>>> alexei.starovoitov@gmail.com wrote: >>>>>> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>>>>>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>>>>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>>>>>> [...] >>>>>>>> >>>>>>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>>>>>> with cls and act distinction and it didn't quite work. cls with >>>>>>>> direct-action is the only >>>>>>>> thing that became mainstream while tc style attach wasn't really addressed. >>>>>>>> There were several incidents where tc had tens of thousands of progs attached >>>>>>>> because of this attach/query/index weirdness described above. >>>>>>>> I think the only way to address this properly is to introduce bpf_link style of >>>>>>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>>>>>> direction-action will be implied. There won't be any index and query >>>>>>>> will be obvious. >>>>>>> >>>>>>> Note that we already have bpf_link support working (without support for pinning >>>>>>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>>>>>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>>>>>> and are able to operate on the exact filter during release. >>>>>> >>>>>> Except they're not unique. The library can stash them, but something else >>>>>> doing detach via iproute2 or their own netlink calls will detach the prog. 
>>>>>> This other app can attach to the same spot a different prog and now >>>>>> bpf_link__destroy will be detaching somebody else prog. >>>>>> >>>>>>>> So I would like to propose to take this patch set a step further from >>>>>>>> what Daniel said: >>>>>>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>>>>>> and make this proposed api to return FD. >>>>>>>> To detach from tc ingress/egress just close(fd). >>>>>>> >>>>>>> You mean adding an fd-based TC API to the kernel? >>>>>> >>>>>> yes. >>>>> >>>>> I'm totally for bpf_link-based TC attachment. >>>>> >>>>> But I think *also* having "legacy" netlink-based APIs will allow >>>>> applications to handle older kernels in a much nicer way without extra >>>>> dependency on iproute2. We have a similar situation with kprobe, where >>>>> currently libbpf only supports "modern" fd-based attachment, but users >>>>> periodically ask questions and struggle to figure out issues on older >>>>> kernels that don't support new APIs. >>>> >>>> +1; I am OK with adding a new bpf_link-based way to attach TC programs, >>>> but we still need to support the netlink API in libbpf. >>>> >>>>> So I think we'd have to support legacy TC APIs, but I agree with >>>>> Alexei and Daniel that we should keep it to the simplest and most >>>>> straightforward API of supporting direction-action attachments and >>>>> setting up qdisc transparently (if I'm getting all the terminology >>>>> right, after reading Quentin's blog post). That coincidentally should >>>>> probably match how bpf_link-based TC API will look like, so all that >>>>> can be abstracted behind a single bpf_link__attach_tc() API as well, >>>>> right? That's the plan for dealing with kprobe right now, btw. Libbpf >>>>> will detect the best available API and transparently fall back (maybe >>>>> with some warning for awareness, due to inherent downsides of legacy >>>>> APIs: no auto-cleanup being the most prominent one). 
>>>> >>>> Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the >>>> high-level API auto-detect. That way users can also still use the >>>> netlink attach function if they don't want the fd-based auto-close >>>> behaviour of bpf_link. >>> >>> So I thought a bit more about this, and it feels like the right move >>> would be to expose only higher-level TC BPF API behind bpf_link. It >>> will keep the API complexity and amount of APIs that libbpf will have >>> to support to the minimum, and will keep the API itself simple: >>> direct-attach with the minimum amount of input arguments. By not >>> exposing low-level APIs we also table the whole bpf_tc_cls_attach_id >>> design discussion, as we now can keep as much info as needed inside >>> bpf_link_tc (which will embed bpf_link internally as well) to support >>> detachment and possibly some additional querying, if needed. >> >> But then there would be no way for the caller to explicitly select a >> mechanism? I.e., if I write a BPF program using this mechanism targeting >> a 5.12 kernel, I'll get netlink attachment, which can stick around when >> I do bpf_link__disconnect(). But then if the kernel gets upgraded to >> support bpf_link for TC programs I'll suddenly transparently get >> bpf_link and the attachments will go away unless I pin them. This >> seems... less than ideal? > > That's what we are doing with bpf_program__attach_kprobe(), though. > And so far I've only seen people (privately) saying how good it would > be to have bpf_link-based TC APIs, doesn't seem like anyone with a > realistic use case prefers the current APIs. So I suspect it's not > going to be a problem in practice. But at least I'd start there and > see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
> There are many ways to skin this cat. I'd prioritize bpf_link-based TC > APIs to be added with legacy TC API as a fallback.
I think the problem here is though that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' api that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
What about pinning? It's not exactly the same, but bpf_link could actually pin a BPF program, if using legacy TC, and pin bpf_link, if using bpf_link-based APIs. Of course before switching from iproute2 to libbpf APIs you'd need to design your applications to use pinning instead of relying implicitly on permanently attached BPF program.
All the progs we load from Cilium in a K8s setting w/ Pods, we could have easily over 100 loaded at the same time on a node, and we template the per Pod ones, so the complexity of managing those pinned lifecycles from the agent and dealing with the semantic/fallback differences between kernels feels probably not worth the gain. So if there would be a libbpf tc simplified attach API, I'd for the time being stick to the existing aka legacy means.
Sure. Then what do you think about keeping only the low-level TC APIs, and in the future adding bpf_program__attach_tc(), which will use the bpf_link-based one? It seems like it's not worth it to pretend we have bpf_link-based semantics with the current "legacy" TC APIs. This is similar to how we have a low-level XDP attach API and a bpf_link-based (only) bpf_program__attach_xdp().
I think that's okay. I guess the question is what we define as the initial scope for the low-level TC API. cls_bpf with fixed direct-action mode + fixed ETH_P_ALL, with the flexibility to specify handle / priority or a block_index, feels reasonable.
Daniel Borkmann daniel@iogearbox.net writes:
On 4/16/21 12:22 AM, Andrii Nakryiko wrote:
On Thu, Apr 15, 2021 at 3:10 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:58 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 4:32 PM Daniel Borkmann daniel@iogearbox.net wrote:
On 4/15/21 1:19 AM, Andrii Nakryiko wrote:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote: > Andrii Nakryiko andrii.nakryiko@gmail.com writes: >> On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >>> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>>> On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: >>>>> Andrii Nakryiko andrii.nakryiko@gmail.com writes: >>>>>> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov >>>>>> alexei.starovoitov@gmail.com wrote: >>>>>>> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >>>>>>>> On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >>>>>>>>> On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >>>>>>>>>> [...] >>>>>>>>> >>>>>>>>> All of these things are messy because of tc legacy. bpf tried to follow tc style >>>>>>>>> with cls and act distinction and it didn't quite work. cls with >>>>>>>>> direct-action is the only >>>>>>>>> thing that became mainstream while tc style attach wasn't really addressed. >>>>>>>>> There were several incidents where tc had tens of thousands of progs attached >>>>>>>>> because of this attach/query/index weirdness described above. >>>>>>>>> I think the only way to address this properly is to introduce bpf_link style of >>>>>>>>> attaching to tc. Such bpf_link would support ingress/egress only. >>>>>>>>> direction-action will be implied. There won't be any index and query >>>>>>>>> will be obvious. >>>>>>>> >>>>>>>> Note that we already have bpf_link support working (without support for pinning >>>>>>>> ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >>>>>>>> chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >>>>>>>> and are able to operate on the exact filter during release. >>>>>>> >>>>>>> Except they're not unique. 
The library can stash them, but something else >>>>>>> doing detach via iproute2 or their own netlink calls will detach the prog. >>>>>>> This other app can attach to the same spot a different prog and now >>>>>>> bpf_link__destroy will be detaching somebody else prog. >>>>>>> >>>>>>>>> So I would like to propose to take this patch set a step further from >>>>>>>>> what Daniel said: >>>>>>>>> int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >>>>>>>>> and make this proposed api to return FD. >>>>>>>>> To detach from tc ingress/egress just close(fd). >>>>>>>> >>>>>>>> You mean adding an fd-based TC API to the kernel? >>>>>>> >>>>>>> yes. >>>>>> >>>>>> I'm totally for bpf_link-based TC attachment. >>>>>> >>>>>> But I think *also* having "legacy" netlink-based APIs will allow >>>>>> applications to handle older kernels in a much nicer way without extra >>>>>> dependency on iproute2. We have a similar situation with kprobe, where >>>>>> currently libbpf only supports "modern" fd-based attachment, but users >>>>>> periodically ask questions and struggle to figure out issues on older >>>>>> kernels that don't support new APIs. >>>>> >>>>> +1; I am OK with adding a new bpf_link-based way to attach TC programs, >>>>> but we still need to support the netlink API in libbpf. >>>>> >>>>>> So I think we'd have to support legacy TC APIs, but I agree with >>>>>> Alexei and Daniel that we should keep it to the simplest and most >>>>>> straightforward API of supporting direction-action attachments and >>>>>> setting up qdisc transparently (if I'm getting all the terminology >>>>>> right, after reading Quentin's blog post). That coincidentally should >>>>>> probably match how bpf_link-based TC API will look like, so all that >>>>>> can be abstracted behind a single bpf_link__attach_tc() API as well, >>>>>> right? That's the plan for dealing with kprobe right now, btw. 
Libbpf >>>>>> will detect the best available API and transparently fall back (maybe >>>>>> with some warning for awareness, due to inherent downsides of legacy >>>>>> APIs: no auto-cleanup being the most prominent one). >>>>> >>>>> Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the >>>>> high-level API auto-detect. That way users can also still use the >>>>> netlink attach function if they don't want the fd-based auto-close >>>>> behaviour of bpf_link. >>>> >>>> So I thought a bit more about this, and it feels like the right move >>>> would be to expose only higher-level TC BPF API behind bpf_link. It >>>> will keep the API complexity and amount of APIs that libbpf will have >>>> to support to the minimum, and will keep the API itself simple: >>>> direct-attach with the minimum amount of input arguments. By not >>>> exposing low-level APIs we also table the whole bpf_tc_cls_attach_id >>>> design discussion, as we now can keep as much info as needed inside >>>> bpf_link_tc (which will embed bpf_link internally as well) to support >>>> detachment and possibly some additional querying, if needed. >>> >>> But then there would be no way for the caller to explicitly select a >>> mechanism? I.e., if I write a BPF program using this mechanism targeting >>> a 5.12 kernel, I'll get netlink attachment, which can stick around when >>> I do bpf_link__disconnect(). But then if the kernel gets upgraded to >>> support bpf_link for TC programs I'll suddenly transparently get >>> bpf_link and the attachments will go away unless I pin them. This >>> seems... less than ideal? >> >> That's what we are doing with bpf_program__attach_kprobe(), though. >> And so far I've only seen people (privately) saying how good it would >> be to have bpf_link-based TC APIs, doesn't seem like anyone with a >> realistic use case prefers the current APIs. So I suspect it's not >> going to be a problem in practice. 
But at least I'd start there and >> see how people are using it and if they need anything else. > > *sigh* - I really wish you would stop arbitrarily declaring your own use > cases "realistic" and mine (implied) "unrealistic". Makes it really hard > to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs. And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
[...]
>> There are many ways to skin this cat. I'd prioritize bpf_link-based TC >> APIs to be added with legacy TC API as a fallback.
I think the problem here is though that this would need to be deterministic when upgrading from one kernel version to another where we don't use the fallback anymore, e.g. in case of Cilium we always want to keep the progs attached to allow headless updates on the agent, meaning, traffic keeps flowing through the BPF datapath while in user space, our agent restarts after upgrade, and atomically replaces the BPF progs once up and running (we're doing this for the whole range of 4.9 to 5.x kernels that we support). While we use the 'simple' api that is discussed here internally in Cilium, this attach behavior would have to be consistent, so transparent fallback inside libbpf on link vs non-link availability won't work (at least in our case).
What about pinning? It's not exactly the same, but bpf_link could actually pin a BPF program, if using legacy TC, and pin bpf_link, if using bpf_link-based APIs. Of course before switching from iproute2 to libbpf APIs you'd need to design your applications to use pinning instead of relying implicitly on permanently attached BPF program.
All the progs we load from Cilium in a K8s setting w/ Pods, we could have easily over 100 loaded at the same time on a node, and we template the per Pod ones, so the complexity of managing those pinned lifecycles from the agent and dealing with the semantic/fallback differences between kernels feels probably not worth the gain. So if there would be a libbpf tc simplified attach API, I'd for the time being stick to the existing aka legacy means.
Sure. Then what do you think about keeping only the low-level TC APIs, and in the future adding bpf_program__attach_tc(), which will use the bpf_link-based one? It seems like it's not worth it to pretend we have bpf_link-based semantics with the current "legacy" TC APIs. This is similar to how we have a low-level XDP attach API and a bpf_link-based (only) bpf_program__attach_xdp().
I think that's okay. I guess the question is what we define as the initial scope for the low-level TC API. cls_bpf with fixed direct-action mode + fixed ETH_P_ALL, with the flexibility to specify handle / priority or a block_index, feels reasonable.
Sounds reasonable to me, with the addition of 'parent' to the things you can specify.
So snipping a few bits from Kumar's patch and paring it down a bit, we'd end up with something like this?
+struct bpf_tc_cls_opts {
+	size_t sz;
+	__u32 chain_index;
+	__u32 handle;
+	__u32 priority;
+	__u32 class_id;
+};
+#define bpf_tc_cls_opts__last_field class_id
+
+/* Acts as a handle for an attached filter */
+struct bpf_tc_cls_attach_id {
+	__u32 ifindex;
+	union {
+		__u32 block_index;
+		__u32 parent_id;
+	};
+	__u32 protocol;
+	__u32 chain_index;
+	__u32 handle;
+	__u32 priority;
+};
+
+struct bpf_tc_cls_info {
+	struct bpf_tc_cls_attach_id id;
+	__u32 class_id;
+	__u32 bpf_flags;
+	__u32 bpf_flags_gen;
+};
+
+LIBBPF_API int bpf_tc_cls_attach_dev(int fd, __u32 ifindex, __u32 parent_id,
+				     const struct bpf_tc_cls_opts *opts,
+				     struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_detach_dev(const struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_get_info_dev(int fd, __u32 ifindex, __u32 parent_id,
+				       const struct bpf_tc_cls_opts *opts,
+				       struct bpf_tc_cls_info *info);
What about change and replace? I guess we could do without those, right?
-Toke
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
> On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov > alexei.starovoitov@gmail.com wrote: >> >> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: >> > On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: >> > > On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: >> > > > [...] >> > > >> > > All of these things are messy because of tc legacy. bpf tried to follow tc style >> > > with cls and act distinction and it didn't quite work. cls with >> > > direct-action is the only >> > > thing that became mainstream while tc style attach wasn't really addressed. >> > > There were several incidents where tc had tens of thousands of progs attached >> > > because of this attach/query/index weirdness described above. >> > > I think the only way to address this properly is to introduce bpf_link style of >> > > attaching to tc. Such bpf_link would support ingress/egress only. >> > > direction-action will be implied. There won't be any index and query >> > > will be obvious. >> > >> > Note that we already have bpf_link support working (without support for pinning >> > ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, >> > chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link >> > and are able to operate on the exact filter during release. >> >> Except they're not unique. The library can stash them, but something else >> doing detach via iproute2 or their own netlink calls will detach the prog. >> This other app can attach to the same spot a different prog and now >> bpf_link__destroy will be detaching somebody else prog. >> >> > > So I would like to propose to take this patch set a step further from >> > > what Daniel said: >> > > int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): >> > > and make this proposed api to return FD. >> > > To detach from tc ingress/egress just close(fd). >> > >> > You mean adding an fd-based TC API to the kernel? >> >> yes. 
> > I'm totally for bpf_link-based TC attachment. > > But I think *also* having "legacy" netlink-based APIs will allow > applications to handle older kernels in a much nicer way without extra > dependency on iproute2. We have a similar situation with kprobe, where > currently libbpf only supports "modern" fd-based attachment, but users > periodically ask questions and struggle to figure out issues on older > kernels that don't support new APIs.
+1; I am OK with adding a new bpf_link-based way to attach TC programs, but we still need to support the netlink API in libbpf.
> So I think we'd have to support legacy TC APIs, but I agree with > Alexei and Daniel that we should keep it to the simplest and most > straightforward API of supporting direction-action attachments and > setting up qdisc transparently (if I'm getting all the terminology > right, after reading Quentin's blog post). That coincidentally should > probably match how bpf_link-based TC API will look like, so all that > can be abstracted behind a single bpf_link__attach_tc() API as well, > right? That's the plan for dealing with kprobe right now, btw. Libbpf > will detect the best available API and transparently fall back (maybe > with some warning for awareness, due to inherent downsides of legacy > APIs: no auto-cleanup being the most prominent one).
Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the high-level API auto-detect. That way users can also still use the netlink attach function if they don't want the fd-based auto-close behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only the higher-level TC BPF API behind bpf_link. It will keep the API complexity and the number of APIs that libbpf will have to support to a minimum, and will keep the API itself simple: direct-attach with the minimum number of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs.
Great, thank you! And yeah, we did discuss exactly this before, which is where my mental sigh came from - I feel like we already covered this ground and that I'm just being dismissed with "that is not a real use case". But OK, I'll give it another shot, see below.
And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
So I guess call me unconvinced (yet? still?). Give it another shot, though.
I'm not arguing against adding bpf_link support, and I'm even fine with making it the default. As you say, there are plenty of use cases where the bpf_link semantics make sense, and the XDP programs in samples all fall in this category. So sure, let's add this support and make this convenient to use.
But there are also use cases where the BPF program lifetime absolutely shouldn't follow that of the userspace application. This includes both applications that don't have a long-running daemon at all (like a firewall that just loads a ruleset at boot; xdp-filter is such an application in the BPF world, but I'm sure there are others). And daemons that use BPF as a data path and want the packets to keep flowing even when they restart, like Cilium as Daniel mentioned.
So the latter category of applications need their BPF programs to be permanently attached to the interface. And sure, this can sorta be done by pinning the bpf_link; but not really, because then:
- You incur a new dependency on bpffs, so you have to make sure that is mounted and that you can get at the particular fs instance you're using; the latter is especially painful if you switch namespaces.
- Your BPF program lifetime is no longer tied to the interface, so you have to deal with garbage collecting your pinned files somehow. This is especially painful if you don't have a daemon.
Together, these issues make bpf_link a much less compelling proposition, to the point where it's no longer the better API for these use cases, IMO. And I know that because I had to work around just these issues with bpf_link for xdp-tools.
But I'm not even asking for the netlink API to be the default, I'm fine with bpf_link being the default and encouraged API. I'm just asking for a way to make it *possible* to select which attach mode I want. Either by a flag to bpf_program__attach_tc(), or by exposing the low-level bpf_tc_cls_*() netlink functions, like we do for XDP.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
I'm not intending to allow legacy kprobe APIs to be "chosen", for instance. Because I'm convinced it's a bad API that no one should use if they can use an FD-based one.
I'd tend to agree with you for the tracing APIs, actually. But a BPF-based data plane is different, as I tried to explain above.
It might be a different case for TC, who knows. I'd just start with safer APIs and then evaluate whether there is a real demand for less safe ones. It's just some minor refactoring and exposing more APIs, when/if we need them.
There you go again with the "real demand" argument. How can I read this in any other way than that you don't consider my use case "real" (as you just assured me above was not the case)? What do you consider "real demand"?
-Toke
On Thu, Apr 15, 2021 at 8:57 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:51 PM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Wed, Apr 14, 2021 at 3:58 AM Toke Høiland-Jørgensen toke@redhat.com wrote:
Andrii Nakryiko andrii.nakryiko@gmail.com writes:
On Tue, Apr 6, 2021 at 3:06 AM Toke Høiland-Jørgensen toke@redhat.com wrote: > > Andrii Nakryiko andrii.nakryiko@gmail.com writes: > > > On Sat, Apr 3, 2021 at 10:47 AM Alexei Starovoitov > > alexei.starovoitov@gmail.com wrote: > >> > >> On Sat, Apr 03, 2021 at 12:38:06AM +0530, Kumar Kartikeya Dwivedi wrote: > >> > On Sat, Apr 03, 2021 at 12:02:14AM IST, Alexei Starovoitov wrote: > >> > > On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote: > >> > > > [...] > >> > > > >> > > All of these things are messy because of tc legacy. bpf tried to follow tc style > >> > > with cls and act distinction and it didn't quite work. cls with > >> > > direct-action is the only > >> > > thing that became mainstream while tc style attach wasn't really addressed. > >> > > There were several incidents where tc had tens of thousands of progs attached > >> > > because of this attach/query/index weirdness described above. > >> > > I think the only way to address this properly is to introduce bpf_link style of > >> > > attaching to tc. Such bpf_link would support ingress/egress only. > >> > > direction-action will be implied. There won't be any index and query > >> > > will be obvious. > >> > > >> > Note that we already have bpf_link support working (without support for pinning > >> > ofcourse) in a limited way. The ifindex, protocol, parent_id, priority, handle, > >> > chain_index tuple uniquely identifies a filter, so we stash this in the bpf_link > >> > and are able to operate on the exact filter during release. > >> > >> Except they're not unique. The library can stash them, but something else > >> doing detach via iproute2 or their own netlink calls will detach the prog. > >> This other app can attach to the same spot a different prog and now > >> bpf_link__destroy will be detaching somebody else prog. 
> >> > >> > > So I would like to propose to take this patch set a step further from > >> > > what Daniel said: > >> > > int bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}): > >> > > and make this proposed api to return FD. > >> > > To detach from tc ingress/egress just close(fd). > >> > > >> > You mean adding an fd-based TC API to the kernel? > >> > >> yes. > > > > I'm totally for bpf_link-based TC attachment. > > > > But I think *also* having "legacy" netlink-based APIs will allow > > applications to handle older kernels in a much nicer way without extra > > dependency on iproute2. We have a similar situation with kprobe, where > > currently libbpf only supports "modern" fd-based attachment, but users > > periodically ask questions and struggle to figure out issues on older > > kernels that don't support new APIs. > > +1; I am OK with adding a new bpf_link-based way to attach TC programs, > but we still need to support the netlink API in libbpf. > > > So I think we'd have to support legacy TC APIs, but I agree with > > Alexei and Daniel that we should keep it to the simplest and most > > straightforward API of supporting direction-action attachments and > > setting up qdisc transparently (if I'm getting all the terminology > > right, after reading Quentin's blog post). That coincidentally should > > probably match how bpf_link-based TC API will look like, so all that > > can be abstracted behind a single bpf_link__attach_tc() API as well, > > right? That's the plan for dealing with kprobe right now, btw. Libbpf > > will detect the best available API and transparently fall back (maybe > > with some warning for awareness, due to inherent downsides of legacy > > APIs: no auto-cleanup being the most prominent one). > > Yup, SGTM: Expose both in the low-level API (in bpf.c), and make the > high-level API auto-detect. That way users can also still use the > netlink attach function if they don't want the fd-based auto-close > behaviour of bpf_link.
So I thought a bit more about this, and it feels like the right move would be to expose only the higher-level TC BPF API behind bpf_link. It will keep the API complexity and the number of APIs that libbpf will have to support to a minimum, and will keep the API itself simple: direct-attach with the minimum number of input arguments. By not exposing low-level APIs we also table the whole bpf_tc_cls_attach_id design discussion, as we now can keep as much info as needed inside bpf_link_tc (which will embed bpf_link internally as well) to support detachment and possibly some additional querying, if needed.
But then there would be no way for the caller to explicitly select a mechanism? I.e., if I write a BPF program using this mechanism targeting a 5.12 kernel, I'll get netlink attachment, which can stick around when I do bpf_link__disconnect(). But then if the kernel gets upgraded to support bpf_link for TC programs I'll suddenly transparently get bpf_link and the attachments will go away unless I pin them. This seems... less than ideal?
That's what we are doing with bpf_program__attach_kprobe(), though. And so far I've only seen people (privately) saying how good it would be to have bpf_link-based TC APIs, doesn't seem like anyone with a realistic use case prefers the current APIs. So I suspect it's not going to be a problem in practice. But at least I'd start there and see how people are using it and if they need anything else.
*sigh* - I really wish you would stop arbitrarily declaring your own use cases "realistic" and mine (implied) "unrealistic". Makes it really hard to have a productive discussion...
Well (sigh?..), this wasn't my intention, sorry you read it this way. But we had similar discussions when I was adding bpf_link-based XDP attach APIs.
Great, thank you! And yeah, we did discuss exactly this before, which is where my mental sigh came from - I feel like we already covered this ground and that I'm just being dismissed with "that is not a real use case". But OK, I'll give it another shot, see below.
And guess what, now I see that samples/bpf/whatever_xdp is switched to bpf_link-based XDP, because that makes everything simpler and more reliable. What I also know is that in production we ran into multiple issues with anything that doesn't auto-detach on process exit/crash (unless pinned explicitly, of course). And that people that are trying to use TC right now are saying how having bpf_link-based TC APIs would make everything *simpler* and *safer*. So I don't know... I understand it might be convenient in some cases to not care about a lifetime of BPF programs you are attaching, but then there are usually explicit and intentional ways to achieve at least similar behavior with safety by default.
So I guess call me unconvinced (yet? still?). Give it another shot, though.
I'm not arguing against adding bpf_link support, and I'm even fine with making it the default. As you say, there are plenty of use cases where the bpf_link semantics make sense, and the XDP programs in samples all fall in this category. So sure, let's add this support and make this convenient to use.
But there are also use cases where the BPF program lifetime absolutely shouldn't follow that of the userspace application. This includes both applications that don't have a long-running daemon at all (like a firewall that just loads a ruleset at boot; xdp-filter is such an application in the BPF world, but I'm sure there are others). And daemons that use BPF as a data path and want the packets to keep flowing even when they restart, like Cilium as Daniel mentioned.
So the latter category of applications need their BPF programs to be permanently attached to the interface. And sure, this can sorta be done by pinning the bpf_link; but not really, because then:
- You incur a new dependency on bpffs, so you have to make sure that is mounted and that you can get at the particular fs instance you're using; the latter is especially painful if you switch namespaces.
So I understand that it's more painful than current TC and legacy XDP APIs for these specific use cases. But for other types of BPF programs that want to persist across user-space process exit, all those need to be addressed and designed around anyway. And that doesn't prevent others from doing it, otherwise we wouldn't even implement BPFFS.
- Your BPF program lifetime is no longer tied to the interface, so you have to deal with garbage collecting your pinned files somehow. This is especially painful if you don't have a daemon.
With bpf_link auto-detach, you'll still get the underlying BPF program freed, only a small bpf_link shell will persist. If you churn through interface going up/down, whichever application is responsible for re-attaching BPF programs would deal with clean up.
But yes, I'm not oblivious to the need to change how you design your applications and new inconveniences that would cause. But think about this from a slightly different point of view. If we had to choose between bpf_link model and current TC API, which one would we choose? I'd argue we should choose bpf_link, because: 1) it allows to do things more safely (auto-cleanup) by default; 2) it still allows for having BPF program/link persistence, even if in a different and slightly more inconvenient way.
So it's a more generic and powerful approach, it's just not as perfectly aligned with the way it's used for cases you mentioned.
Together, these issues make bpf_link a much less compelling proposition, to the point where it's no longer the better API for these use cases, IMO. And I know that because I had to work around just these issues with bpf_link for xdp-tools.
But I'm not even asking for the netlink API to be the default, I'm fine with bpf_link being the default and encouraged API. I'm just asking for a way to make it *possible* to select which attach mode I want. Either by a flag to bpf_program__attach_tc(), or by exposing the low-level bpf_tc_cls_*() netlink functions, like we do for XDP.
Given significant semantic differences between bpf_link and current TC APIs, I'm not sure anymore if it's a good idea to hide current API behind bpf_link abstraction. For tracing it's a good fit, so I still think falling back to legacy kprobe API makes sense there. For TC, given your and Daniel's replies, I'd rather have low-level APIs exposed directly instead of having some options switch in bpf_program__attach_tc() API.
If we expose the low-level API I can elect to just use this if I know I want netlink behaviour, but if bpf_program__attach_tc() is the only API available it would at least need a flag to enforce one mode or the other (I can see someone wanting to enforce kernel bpf_link semantics as well, so a flag for either mode seems reasonable?).
Sophisticated enough users can also do feature detection to know if it's going to work or not.
Sure, but that won't help if there's no API to pick the attach mode they want.
I'm not intending to allow legacy kprobe APIs to be "chosen", for instance. Because I'm convinced it's a bad API that no one should use if they can use an FD-based one.
I'd tend to agree with you for the tracing APIs, actually. But a BPF-based data plane is different, as I tried to explain above.
Right, see above.
It might be a different case for TC, who knows. I'd just start with safer APIs and then evaluate whether there is a real demand for less safe ones. It's just some minor refactoring and exposing more APIs, when/if we need them.
There you go again with the "real demand" argument. How can I read this in any other way than that you don't consider my use case "real" (as you just assured me above was not the case)? What do you consider "real demand"?
Please try not to read more into my words than there really is. By real demand I meant at least a few *different* use cases coming from different parties. That says nothing about whether your use case is real or not, just that it's one case (or at least coming from just one party).
-Toke
On Fri, Apr 2, 2021 at 8:27 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Fri, Apr 02, 2021 at 05:49:29AM IST, Daniel Borkmann wrote:
On 3/31/21 11:44 AM, Kumar Kartikeya Dwivedi wrote:
On Wed, Mar 31, 2021 at 02:55:47AM IST, Daniel Borkmann wrote:
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
The block variant can be dropped, I'll use the TC_BLOCK/TC_DEV alternative which sets parent_id/ifindex properly.
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
I am not opposed to clsact qdisc setup if INGRESS/EGRESS is supplied (not sure how others feel about it).
What speaks against it? It would be 100% clear from API side where the prog is being attached. Same as with tc cmdline where you specify 'ingress'/'egress'.
Ok, I will add the qdisc setup in the next revision.
We could make direct_action mode default, and similarly choose prio
To be honest, I wouldn't even support a mode from the lib/API side where direct_action is not set. It should always be forced to true. Everything else is rather broken setup-wise, imho, since it won't scale. We added direct_action a bit later to the kernel than original cls_bpf, but if I would do it again today, I'd make it the only available option. I don't see a reasonable use-case where you have it to false.
I'm all for doing that, but in some sense that also speaks against SCHED_ACT support. Currently, you can load SCHED_ACT programs using this series, but not really bind them to classifier. I left that option open to a future patch, it would just reuse the existing tc_act_add_action helper (also why I kept it in its own function). Maybe we need to reconsider that, if direct action is the only recommended way going forward (to discourage people from using SCHED_ACT), or just add opts to do all the setup in low level API, instead of leaving it incomplete.
as 1 by default instead of letting the kernel do it. Then you can just pass in NULL for bpf_tc_cls_opts and be close to what you're proposing. For protocol we can choose ETH_P_ALL by default too if the user doesn't set it.
Same here with ETH_P_ALL, I'm not sure anyone uses anything other than ETH_P_ALL, so yes, that should be default.
Ack.
With these modifications, the equivalent would look like bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, INGRESS), NULL, &id);
Few things compared to bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS}):
- nit, but why even 'cls' in the name. I think we shouldn't expose such old-days tc semantics to a user. Just bpf_tc_attach() is cleaner/simpler to understand.
Since it would make it clear this is for SCHED_CLS progs, likewise bpf_tc_act_* is for SCHED_ACT progs. Not opposed to changing the name.
- What's the 'TC_DEV(ifindex, INGRESS)' macro doing exactly? Looks unnecessary, why not regular args to the API?
It is very easy to support BLOCK (I know it's not really popular here, but I think if supporting it just requires adding a macro, then we can go ahead). So the user can use TC_BLOCK(block_idx) instead of remembering ifindex is to be set to TCM_IFINDEX_MAGIC_BLOCK and parent_id to actual block index. It will just expand to:
#define TC_BLOCK(block_idx) TCM_IFINDEX_MAGIC_BLOCK, (block_idx)
TC_DEV macro can be dropped, since user can directly pass ifindex and parent_id.
if _block variant is just a special ifindex value, then it should be fine for users to know such a detail (we can leave a comment mentioning this specifically), especially given it's not a very popular thing. Almost doubling amount of APIs just for this doesn't make much sense, IMO.
- Exposed bpf_tc_attach() API could internally call a bpf_tc_attach_opts() API with preset defaults, and the latter could have all the custom bits if the user needs to go beyond the simple API, so from your bpf_tc_cls_attach() I'd also drop the NULL.
Ok, this is probably better (but maybe we can do this for the high-level bpf_program__attach that returns a bpf_link * instead of introducing yet another function).
If we know that we need variant with options, I'd vote for having just one bpf_tc_attach() API which always takes options. Passing NULL for opts is simple, no need for two APIs, I think.
- For the simple API I'd likely also drop the id (you could have a query API if needed).
This would be fine, because it's not a fast path or anything, but right now we return the id using the netlink response, otherwise for query we have to open the socket, prepare the msg, send and recv again. So it's a minor optimization.
However, there's one other problem. In an earlier version of this series, I didn't keep the id/index out parameters (to act as handle to the newly attached filter/action). This led to problems on query. Suppose a user doesn't properly fill the opts during query (e.g. in case of filters). This means the netlink dump includes all filters matching filled in attributes. If the prog_id for all of these is same (e.g. all have same bpf classifier prog attached to them), it becomes impossible to determine which one is the filter user asked for. It is not possible to enforce filling in all kinds of attributes since some can be left out and assigned by default in the kernel (priority, chain_index etc.). So returning the newly created filter's id turned out to be the best option. This is also used to stash filter related information in bpf_link to properly release it later.
The same problem happens with actions: when we look up using the prog_id, multiple actions with different indices can match the same prog_id. It is not possible to determine which index corresponds to the last loaded action.
So unless there's a better idea on how to deal with this, a query API won't work for the case where same bpf prog is attached more than once. Returning the id/index during attach seemed better than all other options we considered.
Which parts of that id struct is the data that caller might not know or can't know? Is it handle and chain_index? Or just one of them? Or?... If there is something that has to be returned back, I'd keep only that, instead of returning 6+ fields, most of which user should already know.
-- Kartikeya
On Mon, Apr 05, 2021 at 10:51:09PM IST, Andrii Nakryiko wrote:
[...]
if _block variant is just a special ifindex value, then it should be fine for users to know such a detail (we can leave a comment mentioning this specifically), especially given it's not a very popular thing. Almost doubling amount of APIs just for this doesn't make much sense, IMO.
Ok.
If we know that we need variant with options, I'd vote for having just one bpf_tc_attach() API which always takes options. Passing NULL for opts is simple, no need for two APIs, I think.
Ack.
Which parts of that id struct is the data that caller might not know or can't know? Is it handle and chain_index? Or just one of them? Or?... If there is something that has to be returned back, I'd keep only that, instead of returning 6+ fields, most of which user should already know.
The user will know ifindex and parent_id, and perhaps protocol (it would be ETH_P_ALL if they don't supply one by default). Other fields like handle, priority and chain_index can all be kernel assigned, so keeping those still makes sense. I'll change this in v2.
-- Kartikeya
Daniel Borkmann daniel@iogearbox.net writes:
On 3/30/21 10:39 PM, Andrii Nakryiko wrote:
On Sun, Mar 28, 2021 at 1:11 AM Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
On Sun, Mar 28, 2021 at 10:12:40AM IST, Andrii Nakryiko wrote:
Is there some succinct but complete enough documentation/tutorial/etc that I can reasonably read to understand kernel APIs provided by TC (w.r.t. BPF, of course). I'm trying to wrap my head around this and whether API makes sense or not. Please share links, if you have some.
Hi Andrii,
Unfortunately for the kernel API part, I couldn't find any when I was working on this. So I had to read the iproute2 tc code (tc_filter.c, f_bpf.c, m_action.c, m_bpf.c) and the kernel side bits (cls_api.c, cls_bpf.c, act_api.c, act_bpf.c) to grok anything I didn't understand. There's also similar code in libnl (lib/route/{act,cls}.c).
Other than that, these resources were useful (perhaps you already went through some/all of them):
https://docs.cilium.io/en/latest/bpf/#tc-traffic-control https://qmonnet.github.io/whirl-offload/2020/04/11/tc-bpf-direct-action/ tc(8), and tc-bpf(8) man pages
I hope this is helpful!
Thanks! I'll take a look. Sorry, I'm a bit behind with all the stuff, trying to catch up.
I was just wondering if it would be more natural instead of having _dev _block variants and having to specify __u32 ifindex, __u32 parent_id, __u32 protocol, to have some struct specifying TC "destination"? Maybe not, but I thought I'd bring this up early. So you'd have just bpf_tc_cls_attach(), and you'd do something like
bpf_tc_cls_attach(prog_fd, TC_DEV(ifindex, parent_id, protocol))
or
bpf_tc_cls_attach(prog_fd, TC_BLOCK(block_idx, protocol))
? Or it's taking it too far?
But even if not, I think detaching can be unified between _dev and _block, can't it?
Do we even need the _block variant? I would rather prefer to take the chance and make it as simple as possible, and only iff really needed extend with other APIs, for example:
bpf_tc_attach(prog_fd, ifindex, {INGRESS,EGRESS});
Internally, this will create the sch_clsact qdisc & cls_bpf filter instance iff not present yet, and attach to a default prio 1 handle 1, and _always_ in direct-action mode. This is /as simple as it gets/ and we don't need to bother users with more complex tc/cls_bpf internals unless desired. For example, extended APIs could add prio/parent so that multi-prog can be attached to a single cls_bpf instance, but even that could be a second step, imho.
While I'm all for simplifying where possible, the question becomes at what level? I.e., we initially figured we'd expose (most of) the netlink API in the low-level API (patch 3 in the series) and then have the bpf_program__* level API be the simple "just attach" one...
We could simplify the low-level one further, of course, for instance by getting rid of the block stuff entirely, but I don't see much value in leaving out the support for prio/parent in the bpf_tc_cls_* - we'd have to make the API extensible so it could be added later anyway, so why not just include it from the get-go (especially as Kumar has already written the code?)
-Toke
On Thu 25 Mar 2021 at 14:00, Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This adds functions that wrap the netlink API used for adding, manipulating, and removing filters and actions. These functions operate directly on the loaded prog's fd, and return a handle to the filter and action using an out parameter (id for tc_cls, and index for tc_act).
The basic featureset is covered to allow for attaching, manipulation of properties, and removal of filters and actions. Some additional features like TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can be added on top later by extending the bpf_tc_cls_opts struct.
Support for binding actions directly to a classifier by passing them in during filter creation has also been omitted for now. These actions have an auto clean up property because their lifetime is bound to the filter they are attached to. This can be added later, but was omitted for now as direct action mode is a better alternative to it.
An API summary:
The BPF TC-CLS API
bpf_tc_cls_{attach, change, replace}_{dev, block} may be used to attach, change, and replace SCHED_CLS bpf classifiers. Separate sets of functions are provided for network interfaces and shared filter blocks.
bpf_tc_cls_detach_{dev, block} may be used to detach existing SCHED_CLS filter. The bpf_tc_cls_attach_id object filled in during attach, change, or replace must be passed in to the detach functions for them to remove the filter and its attached classifier correctly.
bpf_tc_cls_get_info is a helper that can be used to obtain attributes for the filter and classifier. The opts structure may be used to choose the granularity of search, such that info for a specific filter corresponding to the same loaded bpf program can be obtained. By default, the first match is returned to the user.
Examples:
struct bpf_tc_cls_attach_id id = {}; struct bpf_object *obj; struct bpf_program *p; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_cls_attach_dev(fd, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, NULL, &id); if (r < 0) return r;
... which is roughly equivalent to (after clsact qdisc setup): # tc filter add dev lo ingress bpf obj /home/kkd/foo.o sec classifier
If a user wishes to modify existing options on an attached filter, the bpf_tc_cls_change_{dev, block} API may be used. Parameters like chain_index, priority, and handle are ignored in the bpf_tc_cls_opts struct as they cannot be modified after attaching a filter.
Example:
/* Optional parameters necessary to select the right filter */ DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = id.handle, .priority = id.priority, .chain_index = id.chain_index) /* Turn on direct action mode */ opts.direct_action = true; r = bpf_tc_cls_change_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &id); if (r < 0) return r;
/* Verify that the direct action mode has been set */ struct bpf_tc_cls_info info = {}; r = bpf_tc_cls_get_info_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &info); if (r < 0) return r;
assert(info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
This would be roughly equivalent to doing: # tc filter change dev lo egress prio <p> handle <h> bpf obj /home/kkd/foo.o section classifier da
... except a new bpf program will be loaded and replace existing one.
If a user wishes to either replace an existing filter, or create a new one with the same properties, they can use bpf_tc_cls_replace_dev. The benefit of bpf_tc_cls_change is that it fails if no matching filter exists.
The BPF TC-ACT API
bpf_tc_act_{attach, replace} may be used to attach and replace already attached SCHED_ACT actions. Passing an index of 0 has special meaning, in that an index will be automatically chosen by the kernel. The index chosen by the kernel is the return value of these functions in case of success.
bpf_tc_act_detach may be used to detach a SCHED_ACT action prog identified by the index parameter. The index 0 again has a special meaning, in that passing it will flush all existing SCHED_ACT actions loaded using the ACT API.
bpf_tc_act_get_info is a helper to get the required attributes of a loaded program to be able to manipulate it further, by passing them into the aforementioned functions.
Example:
struct bpf_object *obj; struct bpf_program *p; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "action"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_act_attach(fd, NULL, &index); if (r < 0) return r;
if (bpf_tc_act_detach(index)) return -1;
... which is equivalent to the following sequence: tc action add action bpf obj /home/kkd/foo.o sec action tc action del action bpf index <idx>
How do you handle the locking here? Please note that while RTM_{NEW|GET|DEL}FILTER API has been refactored to handle its own locking internally (and registered with RTNL_FLAG_DOIT_UNLOCKED flag), RTM_{NEW|GET|DEL}ACTION API still expects to be called with rtnl lock taken.
[...]
Vlad Buslov vladbu@nvidia.com writes:
On Thu 25 Mar 2021 at 14:00, Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This adds functions that wrap the netlink API used for adding, manipulating, and removing filters and actions. These functions operate directly on the loaded prog's fd, and return a handle to the filter and action using an out parameter (id for tc_cls, and index for tc_act).
The basic featureset is covered to allow for attaching, manipulation of properties, and removal of filters and actions. Some additional features like TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can be added on top later by extending the bpf_tc_cls_opts struct.
Support for binding actions directly to a classifier by passing them in during filter creation has also been omitted for now. These actions have an auto clean up property because their lifetime is bound to the filter they are attached to. This can be added later, but was omitted for now as direct action mode is a better alternative to it.
An API summary:
The BPF TC-CLS API
bpf_tc_cls_{attach, change, replace}_{dev, block} may be used to attach, change, and replace SCHED_CLS bpf classifiers. Separate sets of functions are provided for network interfaces and shared filter blocks.
bpf_tc_cls_detach_{dev, block} may be used to detach existing SCHED_CLS filter. The bpf_tc_cls_attach_id object filled in during attach, change, or replace must be passed in to the detach functions for them to remove the filter and its attached classifier correctly.
bpf_tc_cls_get_info is a helper that can be used to obtain attributes for the filter and classifier. The opts structure may be used to choose the granularity of search, such that info for a specific filter corresponding to the same loaded bpf program can be obtained. By default, the first match is returned to the user.
Examples:
struct bpf_tc_cls_attach_id id = {}; struct bpf_object *obj; struct bpf_program *p; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_cls_attach_dev(fd, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, NULL, &id); if (r < 0) return r;
... which is roughly equivalent to (after clsact qdisc setup): # tc filter add dev lo ingress bpf obj /home/kkd/foo.o sec classifier
If a user wishes to modify existing options on an attached filter, the bpf_tc_cls_change_{dev, block} API may be used. Parameters like chain_index, priority, and handle are ignored in the bpf_tc_cls_opts struct as they cannot be modified after attaching a filter.
Example:
/* Optional parameters necessary to select the right filter */ DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = id.handle, .priority = id.priority, .chain_index = id.chain_index) /* Turn on direct action mode */ opts.direct_action = true; r = bpf_tc_cls_change_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &id); if (r < 0) return r;
/* Verify that the direct action mode has been set */ struct bpf_tc_cls_info info = {}; r = bpf_tc_cls_get_info_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &info); if (r < 0) return r;
assert(info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
This would be roughly equivalent to doing: # tc filter change dev lo egress prio <p> handle <h> bpf obj /home/kkd/foo.o section classifier da
... except a new bpf program will be loaded and replace existing one.
If a user wishes to either replace an existing filter, or create a new one with the same properties, they can use bpf_tc_cls_replace_dev. The benefit of bpf_tc_cls_change is that it fails if no matching filter exists.
The BPF TC-ACT API
bpf_tc_act_{attach, replace} may be used to attach and replace already attached SCHED_ACT actions. Passing an index of 0 has special meaning, in that an index will be automatically chosen by the kernel. The index chosen by the kernel is the return value of these functions in case of success.
bpf_tc_act_detach may be used to detach a SCHED_ACT action prog identified by the index parameter. The index 0 again has a special meaning, in that passing it will flush all existing SCHED_ACT actions loaded using the ACT API.
bpf_tc_act_get_info is a helper to get the required attributes of a loaded program to be able to manipulate it further, by passing them into the aforementioned functions.
Example:
struct bpf_object *obj; struct bpf_program *p; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "action"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_act_attach(fd, NULL, &index); if (r < 0) return r;
if (bpf_tc_act_detach(index)) return -1;
... which is equivalent to the following sequence: tc action add action bpf obj /home/kkd/foo.o sec action tc action del action bpf index <idx>
How do you handle the locking here? Please note that while RTM_{NEW|GET|DEL}FILTER API has been refactored to handle its own locking internally (and registered with RTNL_FLAG_DOIT_UNLOCKED flag), RTM_{NEW|GET|DEL}ACTION API still expects to be called with rtnl lock taken.
Huh, locking? This is all userspace code that uses the netlink API...
-Toke
On Mon 29 Mar 2021 at 15:32, Toke Høiland-Jørgensen toke@redhat.com wrote:
Vlad Buslov vladbu@nvidia.com writes:
On Thu 25 Mar 2021 at 14:00, Kumar Kartikeya Dwivedi memxor@gmail.com wrote:
This adds functions that wrap the netlink API used for adding, manipulating, and removing filters and actions. These functions operate directly on the loaded prog's fd, and return a handle to the filter and action using an out parameter (id for tc_cls, and index for tc_act).
The basic featureset is covered to allow for attaching, manipulation of properties, and removal of filters and actions. Some additional features like TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can be added on top later by extending the bpf_tc_cls_opts struct.
Support for binding actions directly to a classifier by passing them in during filter creation has also been omitted for now. These actions have an auto clean up property because their lifetime is bound to the filter they are attached to. This can be added later, but was omitted for now as direct action mode is a better alternative to it.
An API summary:
The BPF TC-CLS API
bpf_tc_cls_{attach, change, replace}_{dev, block} may be used to attach, change, and replace SCHED_CLS bpf classifiers. Separate sets of functions are provided for network interfaces and shared filter blocks.
bpf_tc_cls_detach_{dev, block} may be used to detach existing SCHED_CLS filter. The bpf_tc_cls_attach_id object filled in during attach, change, or replace must be passed in to the detach functions for them to remove the filter and its attached classifier correctly.
bpf_tc_cls_get_info is a helper that can be used to obtain attributes for the filter and classifier. The opts structure may be used to choose the granularity of search, such that info for a specific filter corresponding to the same loaded bpf program can be obtained. By default, the first match is returned to the user.
Examples:
struct bpf_tc_cls_attach_id id = {}; struct bpf_object *obj; struct bpf_program *p; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_cls_attach_dev(fd, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, NULL, &id); if (r < 0) return r;
... which is roughly equivalent to (after clsact qdisc setup): # tc filter add dev lo ingress bpf obj /home/kkd/foo.o sec classifier
If a user wishes to modify existing options on an attached filter, the bpf_tc_cls_change_{dev, block} API may be used. Parameters like chain_index, priority, and handle are ignored in the bpf_tc_cls_opts struct as they cannot be modified after attaching a filter.
Example:
/* Optional parameters necessary to select the right filter */ DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = id.handle, .priority = id.priority, .chain_index = id.chain_index) /* Turn on direct action mode */ opts.direct_action = true; r = bpf_tc_cls_change_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &id); if (r < 0) return r;
/* Verify that the direct action mode has been set */ struct bpf_tc_cls_info info = {}; r = bpf_tc_cls_get_info_dev(fd, id.ifindex, id.parent_id, id.protocol, &opts, &info); if (r < 0) return r;
assert(info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT);
This would be roughly equivalent to doing: # tc filter change dev lo egress prio <p> handle <h> bpf obj /home/kkd/foo.o section classifier da
... except a new bpf program will be loaded and replace existing one.
If a user wishes to either replace an existing filter, or create a new one with the same properties, they can use bpf_tc_cls_replace_dev. The benefit of bpf_tc_cls_change is that it fails if no matching filter exists.
The BPF TC-ACT API
bpf_tc_act_{attach, replace} may be used to attach and replace already attached SCHED_ACT actions. Passing an index of 0 has special meaning, in that an index will be automatically chosen by the kernel. The index chosen by the kernel is the return value of these functions in case of success.
bpf_tc_act_detach may be used to detach a SCHED_ACT action prog identified by the index parameter. The index 0 again has a special meaning, in that passing it will flush all existing SCHED_ACT actions loaded using the ACT API.
bpf_tc_act_get_info is a helper to get the required attributes of a loaded program to be able to manipulate it further, by passing them into the aforementioned functions.
Example:
struct bpf_object *obj; struct bpf_program *p; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "action"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
if (bpf_object__load(obj) < 0) return -1;
fd = bpf_program__fd(p);
r = bpf_tc_act_attach(fd, NULL, &index); if (r < 0) return r;
if (bpf_tc_act_detach(index)) return -1;
... which is equivalent to the following sequence: tc action add action bpf obj /home/kkd/foo.o sec action tc action del action bpf index <idx>
How do you handle the locking here? Please note that while RTM_{NEW|GET|DEL}FILTER API has been refactored to handle its own locking internally (and registered with RTNL_FLAG_DOIT_UNLOCKED flag), RTM_{NEW|GET|DEL}ACTION API still expects to be called with rtnl lock taken.
Huh, locking? This is all userspace code that uses the netlink API...
-Toke
Thanks for the clarification. I'm not familiar with libbpf internals and it wasn't obvious to me that this functionality is not for creating classifiers/actions from BPF program executing in kernel-space.
A high level API is provided using the aforementioned routines internally, and these return a bpf_link object to the user. These are limited to just attach for now, and can be extended to change/replace if the use case arises in the future. It is also possible to call bpf_link__disconnect on the link and switch to managing the filter/action manually if the need arises. In most cases, the higher level API should suffice.
Example:
struct bpf_tc_cls_info info = {}; struct bpf_object *obj; struct bpf_program *p; struct bpf_link *link; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "classifier"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = 1); link = bpf_program__attach_tc_cls_dev(p, if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, &opts); if (IS_ERR_OR_NULL(link)) return PTR_ERR(link);
/* We want to take ownership of the filter, so we disconnect the * link and detach it on our own */ bpf_link__disconnect(link);
r = bpf_tc_cls_get_info_dev(bpf_program__fd(fd), if_nametoindex("lo"), BPF_TC_CLSACT_INGRESS, ETH_P_IP, &opts, &info); if (r < 0) return r;
/* We get the attach_id in the info struct, pass it to detach */ bpf_tc_cls_detach_dev(&info.id);
bpf_link__destroy(link);
Example:
struct bpf_object *obj; struct bpf_program *p; struct bpf_link *link; __u32 index; int fd, r;
obj = bpf_object_open("foo.o"); if (IS_ERR_OR_NULL(obj)) return PTR_ERR(obj);
p = bpf_object__find_program_by_title(obj, "action"); if (IS_ERR_OR_NULL(p)) return PTR_ERR(p);
/* A simple example that attaches a SCHED_ACT prog */ link = bpf_program__attach_tc_act(p, NULL); if (IS_ERR_OR_NULL(link)) return PTR_ERR(link);
bpf_link__destroy(link);
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com --- tools/lib/bpf/libbpf.c | 110 ++++++++++++++++++++++++++++++++++++++- tools/lib/bpf/libbpf.h | 15 ++++++ tools/lib/bpf/libbpf.map | 3 ++ 3 files changed, 127 insertions(+), 1 deletion(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 058b643cbcb1..cc5c200a661d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -33,6 +33,7 @@ #include <linux/filter.h> #include <linux/list.h> #include <linux/limits.h> +#include <linux/rtnetlink.h> #include <linux/perf_event.h> #include <linux/ring_buffer.h> #include <linux/version.h> @@ -6847,7 +6848,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
for (i = 0; i < obj->nr_programs; i++) { struct bpf_program *p = &obj->programs[i]; - + if (!p->nr_reloc) continue;
@@ -9443,6 +9444,10 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, struct bpf_link { int (*detach)(struct bpf_link *link); int (*destroy)(struct bpf_link *link); + union { + struct bpf_tc_cls_attach_id *tc_cls_id; + __u32 tc_act_index; + }; char *pin_path; /* NULL, if not pinned */ int fd; /* hook FD, -1 if not applicable */ bool disconnected; @@ -10199,6 +10204,109 @@ struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map) return link; }
+static int bpf_link__detach_tc_cls(struct bpf_link *link) +{ + return bpf_tc_cls_detach_dev(link->tc_cls_id); +} + +static int bpf_link__destroy_tc_cls(struct bpf_link *link) +{ + zfree(&link->tc_cls_id); + return 0; +} + +struct bpf_link *bpf_program__attach_tc_cls_dev(struct bpf_program *prog, + __u32 ifindex, __u32 parent_id, + __u32 protocol, + const struct bpf_tc_cls_opts *opts) +{ + struct bpf_tc_cls_attach_id *id = NULL; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + int prog_fd, err; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return ERR_PTR(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-ENOMEM); + link->detach = &bpf_link__detach_tc_cls; + link->destroy = &bpf_link__destroy_tc_cls; + link->fd = -1; + + id = calloc(1, sizeof(*id)); + if (!id) { + err = -ENOMEM; + goto end; + } + + err = bpf_tc_cls_attach_dev(prog_fd, ifindex, parent_id, protocol, opts, id); + if (err < 0) { + pr_warn("prog '%s': failed to attach classifier: %s\n", + prog->name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto end; + } + + link->tc_cls_id = id; + return link; + +end: + free(id); + free(link); + return ERR_PTR(err); +} + +struct bpf_link *bpf_program__attach_tc_cls_block(struct bpf_program *prog, + __u32 block_index, __u32 protocol, + const struct bpf_tc_cls_opts *opts) +{ + return bpf_program__attach_tc_cls_dev(prog, TCM_IFINDEX_MAGIC_BLOCK, block_index, + protocol, opts); +} + +static int bpf_link__detach_tc_act(struct bpf_link *link) +{ + return bpf_tc_act_detach(link->tc_act_index); +} + +struct bpf_link *bpf_program__attach_tc_act(struct bpf_program *prog, + const struct bpf_tc_act_opts *opts) +{ + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + int prog_fd, err; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loading\n", prog->name); + return ERR_PTR(-EINVAL); + 
} + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-ENOMEM); + link->detach = &bpf_link__detach_tc_act; + link->fd = -1; + + err = bpf_tc_act_attach(prog_fd, opts, &link->tc_act_index); + if (err < 0) { + pr_warn("prog '%s': failed to attach action: %s\n", prog->name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto end; + } + + return link; + +end: + free(link); + return ERR_PTR(err); +} + enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 63baef6045b1..e33720d0b672 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -268,6 +268,21 @@ LIBBPF_API struct bpf_link * bpf_program__attach_freplace(struct bpf_program *prog, int target_fd, const char *attach_func_name);
+struct bpf_tc_cls_opts; +struct bpf_tc_act_opts; + +LIBBPF_API struct bpf_link * +bpf_program__attach_tc_cls_dev(struct bpf_program *prog, __u32 ifindex, + __u32 parent_id, __u32 protocol, + const struct bpf_tc_cls_opts *opts); +LIBBPF_API struct bpf_link * +bpf_program__attach_tc_cls_block(struct bpf_program *prog, __u32 block_index, + __u32 protocol, + const struct bpf_tc_cls_opts *opts); +LIBBPF_API struct bpf_link * +bpf_program__attach_tc_act(struct bpf_program *prog, + const struct bpf_tc_act_opts *opts); + struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 72022b45a8b9..2e1390e4ebf0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -373,4 +373,7 @@ LIBBPF_0.4.0 { bpf_tc_cls_replace_dev; bpf_tc_cls_get_info_dev; bpf_tc_cls_get_info_block; + bpf_program__attach_tc_cls_dev; + bpf_program__attach_tc_cls_block; + bpf_program__attach_tc_act; } LIBBPF_0.3.0;
This adds some basic tests for the low level bpf_tc_* API and its bpf_program__attach_tc_* wrapper on top.
Reviewed-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi memxor@gmail.com --- .../selftests/bpf/prog_tests/test_tc_bpf.c | 261 ++++++++++++++++++ .../selftests/bpf/progs/test_tc_bpf_kern.c | 18 ++ 2 files changed, 279 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_tc_bpf.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_bpf_kern.c
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_bpf.c b/tools/testing/selftests/bpf/prog_tests/test_tc_bpf.c new file mode 100644 index 000000000000..8bab56b4dea0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_bpf.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <linux/err.h> +#include <bpf/libbpf.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <test_progs.h> +#include <linux/if_ether.h> + +#define LO_IFINDEX 1 + +static int test_tc_cls_internal(int fd, __u32 parent_id) +{ + struct bpf_tc_cls_attach_id id = {}; + struct bpf_tc_cls_info info = {}; + int ret; + DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .handle = 1, .priority = 10, + .class_id = TC_H_MAKE(1UL << 16, 1), + .chain_index = 5); + + ret = bpf_tc_cls_attach_dev(fd, LO_IFINDEX, parent_id, ETH_P_IP, &opts, + &id); + if (CHECK_FAIL(ret < 0)) + return ret; + + ret = bpf_tc_cls_get_info_dev(fd, LO_IFINDEX, parent_id, ETH_P_IP, NULL, + &info); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = -1; + + if (CHECK_FAIL(info.id.ifindex != id.ifindex) || + CHECK_FAIL(info.id.parent_id != id.parent_id) || + CHECK_FAIL(info.id.handle != id.handle) || + CHECK_FAIL(info.id.protocol != id.protocol) || + CHECK_FAIL(info.id.chain_index != id.chain_index) || + CHECK_FAIL(info.id.priority != id.priority) || + CHECK_FAIL(info.id.ifindex != LO_IFINDEX) || + CHECK_FAIL(info.id.parent_id != parent_id) || + CHECK_FAIL(info.id.handle != 1) || + CHECK_FAIL(info.id.priority != 10) || + CHECK_FAIL(info.id.protocol != ETH_P_IP) || + CHECK_FAIL(info.class_id != TC_H_MAKE(1UL << 16, 1)) || + CHECK_FAIL(info.id.chain_index != 5)) + goto end; + + opts.direct_action = true; + ret = bpf_tc_cls_replace_dev(fd, id.ifindex, id.parent_id, id.protocol, + &opts, &id); + if (CHECK_FAIL(ret < 0)) + return ret; + +end:; + ret = bpf_tc_cls_detach_dev(&id); + CHECK_FAIL(ret < 0); + return ret; +} + +static int test_tc_cls(struct bpf_program *prog, __u32 
parent_id) +{ + struct bpf_tc_cls_info info = {}; + struct bpf_link *link; + int ret; + DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, .priority = 10, .handle = 1, + .class_id = TC_H_MAKE(1UL << 16, 1)); + + link = bpf_program__attach_tc_cls_dev(prog, LO_IFINDEX, parent_id, + ETH_P_ALL, &opts); + if (CHECK_FAIL(IS_ERR_OR_NULL(link))) + return PTR_ERR(link); + + ret = bpf_tc_cls_get_info_dev(bpf_program__fd(prog), LO_IFINDEX, + parent_id, ETH_P_ALL, NULL, &info); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = -1; + + if (CHECK_FAIL(info.id.ifindex != LO_IFINDEX) || + CHECK_FAIL(info.id.handle != 1) || + CHECK_FAIL(info.id.priority != 10) || + CHECK_FAIL(info.id.protocol != ETH_P_ALL) || + CHECK_FAIL(info.class_id != TC_H_MAKE(1UL << 16, 1))) + goto end; + + /* Demonstrate changing attributes (e.g. to direct action) */ + opts.class_id = TC_H_MAKE(1UL << 16, 2); + opts.direct_action = true; + + /* Disconnect as we drop to the lower level API, which invalidates the + * link. + */ + bpf_link__disconnect(link); + + ret = bpf_tc_cls_change_dev(bpf_program__fd(prog), info.id.ifindex, + info.id.parent_id, info.id.protocol, &opts, + &info.id); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = bpf_tc_cls_get_info_dev(bpf_program__fd(prog), info.id.ifindex, + info.id.parent_id, info.id.protocol, NULL, + &info); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = -1; + + if (CHECK_FAIL(info.class_id != TC_H_MAKE(1UL << 16, 2))) + goto end; + if (CHECK_FAIL((info.bpf_flags & TCA_BPF_FLAG_ACT_DIRECT) != 1)) + goto end; + + ret = bpf_tc_cls_detach_dev(&info.id); + if (CHECK_FAIL(ret < 0)) + goto end; + +end: + ret = bpf_link__destroy(link); + CHECK_FAIL(ret < 0); + return ret; +} + +static int test_tc_act_internal(int fd) +{ + struct bpf_tc_act_info info = {}; + __u32 index = 0; + int ret; + DECLARE_LIBBPF_OPTS(bpf_tc_act_opts, opts, 0); + + ret = bpf_tc_act_attach(fd, &opts, &index); + if (CHECK_FAIL(ret < 0 || !index)) + goto end; + + index = 0; + ret = bpf_tc_act_attach(fd, &opts, 
&index); + if (CHECK_FAIL(ret < 0 || !index)) + goto end; + + opts.index = 3; + index = 0; + ret = bpf_tc_act_attach(fd, &opts, &index); + if (CHECK_FAIL(ret < 0 || !index)) + goto end; + + index = 0; + ret = bpf_tc_act_replace(fd, &opts, &index); + if (CHECK_FAIL(ret < 0 || !index)) + goto end; + + opts.index = 1; + ret = bpf_tc_act_attach(fd, &opts, &index); + if (CHECK_FAIL(!ret || ret != -EEXIST)) { + ret = -1; + goto end; + } + + for (int i = 0; i < 3; i++) { + memset(&info, 0, sizeof(info)); + + ret = bpf_tc_act_get_info(fd, &info); + if (CHECK_FAIL(ret < 0 && ret != -ESRCH)) + goto end; + + if (CHECK_FAIL(ret == -ESRCH)) + goto end; + + if (CHECK_FAIL(info.refcnt != 1)) + goto end; + + ret = bpf_tc_act_detach(info.index); + if (CHECK_FAIL(ret < 0)) + goto end; + } + + CHECK_FAIL(bpf_tc_act_get_info(fd, &info) == -ESRCH); + +end: + ret = bpf_tc_act_detach(0); + CHECK_FAIL(ret < 0); + return ret; +} + +static int test_tc_act(struct bpf_program *prog) +{ + struct bpf_tc_act_info info = {}; + struct bpf_link *link; + int ret; + DECLARE_LIBBPF_OPTS(bpf_tc_act_opts, opts, .index = 42); + + link = bpf_program__attach_tc_act(prog, &opts); + if (CHECK_FAIL(IS_ERR_OR_NULL(link))) + return PTR_ERR(link); + + ret = bpf_tc_act_get_info(bpf_program__fd(prog), &info); + if (CHECK_FAIL(ret < 0)) + goto end; + + if (CHECK_FAIL(info.index != 42)) + goto end; + +end: + ret = bpf_link__destroy(link); + CHECK_FAIL(ret < 0); + return ret; +} + +void test_test_tc_bpf(void) +{ + const char *file = "./test_tc_bpf_kern.o"; + int cls_fd, act_fd, ret; + struct bpf_program *clsp, *actp; + struct bpf_object *obj; + + obj = bpf_object__open(file); + if (CHECK_FAIL(IS_ERR_OR_NULL(obj))) + return; + + clsp = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(IS_ERR_OR_NULL(clsp))) + goto end; + + actp = bpf_object__find_program_by_title(obj, "action"); + if (CHECK_FAIL(IS_ERR_OR_NULL(clsp))) + goto end; + + ret = bpf_object__load(obj); + if (CHECK_FAIL(ret < 0)) + goto 
end; + + cls_fd = bpf_program__fd(clsp); + act_fd = bpf_program__fd(actp); + + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) + goto end; + + ret = test_tc_cls_internal(cls_fd, BPF_TC_CLSACT_INGRESS); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = test_tc_cls(clsp, BPF_TC_CLSACT_EGRESS); + if (CHECK_FAIL(ret < 0)) + goto end; + + system("tc qdisc del dev lo clsact"); + + ret = test_tc_act_internal(act_fd); + if (CHECK_FAIL(ret < 0)) + goto end; + + ret = test_tc_act(actp); + if (CHECK_FAIL(ret < 0)) + goto end; + +end: + bpf_object__close(obj); + return; +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_bpf_kern.c b/tools/testing/selftests/bpf/progs/test_tc_bpf_kern.c new file mode 100644 index 000000000000..d39644ea0fd7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_bpf_kern.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +// Dummy prog to test tc_bpf API + +SEC("classifier") +int cls(struct __sk_buff *skb) +{ + return 0; +} + +SEC("action") +int act(struct __sk_buff *skb) +{ + return 0; +}
On Thu, Mar 25, 2021 at 05:30:03PM +0530, Kumar Kartikeya Dwivedi wrote:
This adds some basic tests for the low level bpf_tc_* API and its bpf_program__attach_tc_* wrapper on top.
*_block() apis from patch 3 and 4 are not covered by this selftest. Why were they added ? And how were they tested?
Pls trim your cc. bpf@vger and netdev@vger would have been enough.
My main concern with this set is that it adds netlink apis to libbpf while we already agreed to split xdp manipulation pieces out of libbpf. It would be odd to add tc apis now only to split them later. I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
Similarly I think integrating static linking into libbpf was a mistake. It should be a sub library as well.
If we end up with core libbpf and ten sublibs for tc, xdp, af_xdp, linking, whatever else the users would appreciate that we don't shove single libbpf to them with a ton of features that they might never use.
Alexei Starovoitov alexei.starovoitov@gmail.com writes:
On Thu, Mar 25, 2021 at 05:30:03PM +0530, Kumar Kartikeya Dwivedi wrote:
This adds some basic tests for the low level bpf_tc_* API and its bpf_program__attach_tc_* wrapper on top.
*_block() apis from patch 3 and 4 are not covered by this selftest. Why were they added ? And how were they tested?
Pls trim your cc. bpf@vger and netdev@vger would have been enough.
My main concern with this set is that it adds netlink apis to libbpf while we already agreed to split xdp manipulation pieces out of libbpf. It would be odd to add tc apis now only to split them later.
We're not removing the ability to attach an XDP program via netlink from libxdp, though. This is the equivalent for TC: the minimum support to attach a program, and if you want to do more, you pull in another library or roll your own.
I'm fine with cutting out more stuff and making this even more minimal (e.g., remove the block stuff and only support attach/detach on ifaces), but we figured we'd err on the side of including too much and getting some feedback from others on which bits are the essential ones to keep, and which can be dropped.
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
I agree for the higher-level stuff (though I'm not sure what that would be for TC), but right now TC programs are the only ones that cannot be attached by libbpf, which is annoying; that's what we're trying to fix.
-Toke
On Sat, Mar 27, 2021 at 04:17:16PM +0100, Toke Høiland-Jørgensen wrote:
Alexei Starovoitov alexei.starovoitov@gmail.com writes:
On Thu, Mar 25, 2021 at 05:30:03PM +0530, Kumar Kartikeya Dwivedi wrote:
This adds some basic tests for the low level bpf_tc_* API and its bpf_program__attach_tc_* wrapper on top.
*_block() apis from patch 3 and 4 are not covered by this selftest. Why were they added ? And how were they tested?
Pls trim your cc. bpf@vger and netdev@vger would have been enough.
My main concern with this set is that it adds netlink apis to libbpf while we already agreed to split xdp manipulation pieces out of libbpf. It would be odd to add tc apis now only to split them later.
We're not removing the ability to attach an XDP program via netlink from libxdp, though. This is the equivalent for TC: the minimum support to attach a program, and if you want to do more, you pull in another library or roll your own.
I'm fine with cutting out more stuff and making this even more minimal (e.g., remove the block stuff and only support attach/detach on ifaces), but we figured we'd err on the side of including too much and getting some feedback from others on which bits are the essential ones to keep, and which can be dropped.
This is up to you. I'm trying to understand the motivation for *_block() apis. I'm not taking a stance for/against them.
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
I agree for the higher-level stuff (though I'm not sure what that would be for TC), but right now TC programs are the only ones that cannot be attached by libbpf, which is annoying; that's what we're trying to fix.
Sure. I wasn't saying that there is no place for these APIs in libbpf+. Just that existing libbpf has already become a kitchen sink of features that users are not going to use like static linking. tc-api was a straw that broke the camel's back. I think we must move static linking and skeleton out of libbpf before the next release.
On Mon, Mar 29, 2021 at 06:56:02AM IST, Alexei Starovoitov wrote:
This is up to you. I'm trying to understand the motivation for *_block() apis. I'm not taking a stance for/against them.
The block APIs simply attach to a different shared filter block, so in that sense they just forward to the bpf_tc_cls_*_dev API internally, where parent_id is substituted as block_index, and ifindex is set to a special value (to indicate operation on a block), but is still a distinct attach point, and both APIs cannot be mixed (i.e. manipulation of filter attached using block API is not possible using dev API).
e.g.
# tc qdisc add dev <foo> ingress block 1 # tc qdisc add dev <bar> ingress block 1
Now you can attach a filter to the shared block, e.g.
# tc filter add block 1 bpf /home/kkd/foo.o sec cls direct-action
and it will attach the identical filter with the bpf prog classifier to both qdiscs in one go, instead of having to duplicate filter creation for each qdisc. You can add arbitrarily many qdiscs to such a filter block, easing filter management, and saving on resources.
So for the API, it made sense to separate this into its own function as it is a different attach point, both for the low level API and their higher level wrappers. This does increase the symbol count, but maintenance wise it is zero-cost since it simply forwards to the dev functions.
As for the tests, I'll add them for the block API in v2, when I get around to sending it (i.e. after the review is over).
[...]
-- Kartikeya
On Fri, Mar 26, 2021 at 7:15 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Thu, Mar 25, 2021 at 05:30:03PM +0530, Kumar Kartikeya Dwivedi wrote:
This adds some basic tests for the low level bpf_tc_* API and its bpf_program__attach_tc_* wrapper on top.
*_block() apis from patch 3 and 4 are not covered by this selftest. Why were they added ? And how were they tested?
Pls trim your cc. bpf@vger and netdev@vger would have been enough.
My main concern with this set is that it adds netlink apis to libbpf while we already agreed to split xdp manipulation pieces out of libbpf. It would be odd to add tc apis now only to split them later.
We weren't going to split out basic attach APIs at all. So bpf_set_link_xdp_fd() and bpf_program__attach_xdp() would stay in libbpf. libxdp/libxsk would contain higher-level APIs which establish additional conventions, beyond the basic operation of attaching BPF program to XDP hook. E.g, all the chaining and how individual XDP "sub-programs" are ordered, processed, updated/replaced, etc. That's all based on one particular convention that libxdp would establish, so that part shouldn't live in libbpf.
So in that sense, having TC attach APIs makes sense to complete libbpf's APIs. I think it's totally in libbpf's domain to provide APIs of the form "attach BPF program to BPF hook".
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
Similarly I think integrating static linking into libbpf was a mistake. It should be a sub library as well.
If we end up with core libbpf and ten sublibs for tc, xdp, af_xdp, linking, whatever else the users would appreciate that we don't shove single libbpf to them with a ton of features that they might never use.
What's the concern exactly? The size of the library? Having 10 micro-libraries has its own set of downsides, I'm not convinced that's a better situation for end users. And would certainly cause more hassle for libbpf developers and packagers.
And what did you include in "core libbpf"?
On Sat, Mar 27, 2021 at 09:32:58PM -0700, Andrii Nakryiko wrote:
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
Similarly I think integrating static linking into libbpf was a mistake. It should be a sub library as well.
If we end up with core libbpf and ten sublibs for tc, xdp, af_xdp, linking, whatever else the users would appreciate that we don't shove single libbpf to them with a ton of features that they might never use.
What's the concern exactly? The size of the library? Having 10 micro-libraries has its own set of downsides,
specifically?
I'm not convinced that's a better situation for end users. And would certainly cause more hassle for libbpf developers and packagers.
For developers and packagers.. yes. For users.. quite the opposite. The skel gen and static linking must be split out before the next libbpf release. Not a single application linked with libbpf is going to use those pieces. bpftool is the one and only that needs them. Hence forcing libbpf users to increase their .text with dead code is a selfish call of libbpf developers and packagers. The user's priorities must come first.
And what did you include in "core libbpf"?
I would take this opportunity to split libbpf into maintainable pieces: - libsysbpf - sys_bpf wrappers (pretty much tools/lib/bpf/bpf.c) - libbpfutil - hash, strset - libbtf - BTF read/write - libbpfelf - ELF parsing, CORE, ksym, kconfig - libbpfskel - skeleton gen used by bpftool only - libbpflink - linker used by bpftool only - libbpfnet - networking attachment via netlink including TC and XDP - libbpftrace - perfbuf, ringbuf - libxdp - Toke's xdp chaining - libxsk - af_xdp logic
In the future the stack trace symbolization code can come into libbpftrace or be a part of its own lib. My upcoming loader program and signed prog generation logic can be part of libbpfskel.
On Sun, Mar 28, 2021 at 6:40 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sat, Mar 27, 2021 at 09:32:58PM -0700, Andrii Nakryiko wrote:
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
Similarly I think integrating static linking into libbpf was a mistake. It should be a sub library as well.
If we end up with core libbpf and ten sublibs for tc, xdp, af_xdp, linking, whatever else the users would appreciate that we don't shove single libbpf to them with a ton of features that they might never use.
What's the concern exactly? The size of the library? Having 10 micro-libraries has its own set of downsides,
specifically?
You didn't answer my question, but from what you write below I assume libbpf size is your main concern?
As for downsides, I'm sure I'm not yet seeing all of the problems we'll encounter when splitting libbpf into 10 pieces. But as a user, having to figure out which libraries I need to use is a big hassle. E.g., for XDP application using ringbuf, I'll need libbpfelf, libbpftrace, libbpfnet, which implicitly also would depend on libsysbpf, libbtf, libbpfutil, I assume. So having to list 3 vs 1 library is already annoying, but when statically linking I'd need to specify all 6. I'd very much rather know that it has to be -lbpf and it will provide me with all the basics (and it's already -lz and -lelf in static linking scenario, which I wish we could get rid of).
I'm not convinced that's a better situation for end users. And would certainly cause more hassle for libbpf developers and packagers.
For developers and packagers.. yes. For users.. quite the opposite.
See above. I don't know which hassle is libbpf for users today. You were implying code size used for functionality users might not use (e.g., linker). Libbpf is a very small library, <300KB. There are users building tools for constrained embedded systems that use libbpf. There are/were various problems mentioned, but the size of libbpf wasn't yet one of them. We should certainly watch the code bloat, but we are not yet at the point where library is too big for users to be turned off. In shared library case it's even less of a concern.
The skel gen and static linking must be split out before the next libbpf release. Not a single application linked with libbpf is going to use those pieces. bpftool is one and only that needs them. Hence forcing libbpf users to increase their .text with a dead code is a selfish call of libbpf developers and packagers. The user's priorities must come first.
And what did you include in "core libbpf"?
I would take this opportunity to split libbpf into maintainable pieces:
- libsysbpf - sys_bpf wrappers (pretty much tools/lib/bpf/bpf.c)
- libbpfutil - hash, strset
strset and hash are internal data structures, I never intended to expose them through public APIs. I haven't investigated, but if we have a separate shared library (libbpfutil), I imagine we won't be able to hide those APIs, right?
- libbtf - BTF read/write
- libbpfelf - ELF parsing, CORE, ksym, kconfig
- libbpfskel - skeleton gen used by bpftool only
skeleton generation is already part of bpftool, there is no need to split anything out
- libbpflink - linker used by bpftool only
- libbpfnet - networking attachment via netlink including TC and XDP
- libbpftrace - perfbuf, ringbuf
ringbuf and perfbuf are both very small code-wise, and are used in majority of BPF applications anyways
- libxdp - Toke's xdp chaining
- libxsk - af_xdp logic
Now, if we look at libbpf .o files, we can approximately see what functionality is using most code:
File Size Percent
bpf.o 17800 4.88 bpf_prog_linfo.o 2952 0.81 btf_dump.o 20472 5.61 btf.o 58160 15.93 hashmap.o 4056 1.11 libbpf_errno.o 2912 0.80 libbpf.o 190072 52.06 libbpf_probes.o 6696 1.83 linker.o 29408 8.05 netlink.o 5944 1.63 nlattr.o 2744 0.75 ringbuf.o 6128 1.68 str_error.o 1640 0.45 strset.o 3656 1.00 xsk.o 12456 3.41
Total 365096 100.00
so libbpf.o which has mostly bpf_object open/load logic and CO-RE take more than half already. And it depends on still more stuff in btf, hashmap, bpf, libbpf_probes, errno. But the final code size is even smaller, because libbpf.so is just 285128 bytes (not 365096 as implied by the table above), so even these numbers are pessimistic.
linker.o is about 8% of the code right now, but it actually takes less than 29KB: when I remove linker.o and re-compile, the final libbpf.so goes from 285128 to 267576 bytes, a reduction of 17552. Even if it grows 2x, I'd still say it's not a big deal.
One reason to keep BPF linker in libbpf is that it is not only bpftool that would be using it. Our libbpf Rust bindings are implementing their own BPF skeleton generation, and we'd like to use linker APIs to support static linking when using libbpf-rs without depending on bpftool. So having it in libbpf and not in bpftool is good when you consider the wider ecosystem.
But again, let's just reflect for a second that we are talking about the library that takes less than 300KB total. It would be also interesting to experiment with LTO and its effect on final binaries when statically linking against libbpf. I haven't tried yet, though.
In the future the stack trace symbolization code can come into libbpftrace or be a part of its own lib. My upcoming loader program and signed prog generation logic can be part of libbpfskel.
On Sun, Mar 28, 2021 at 07:38:42PM -0700, Andrii Nakryiko wrote:
See above. I don't know which hassle is libbpf for users today. You were implying code size used for functionality users might not use (e.g., linker). Libbpf is a very small library, <300KB. There are users building tools for constrained embedded systems that use libbpf. There are/were various problems mentioned, but the size of libbpf wasn't yet one of them. We should certainly watch the code bloat, but we are not yet at the point where library is too big for users to be turned off.
It's true that today sizeof(libbpf + libelf + libz) ~ 500k is not a concern. I'm worried what it will get to if we don't start splitting things up.
Why split libxdp into its own lib? If tc attach is going to be part of libbpf all things xdp should be part of libbpf as well.
But af_xdp folks are probably annoyed that they need to add -lelf and -lz though they're not using them. Just a tech debt that eventually needs to be paid.
I would take this opportunity to split libbpf into maintainable pieces:
- libsysbpf - sys_bpf wrappers (pretty much tools/lib/bpf/bpf.c)
- libbpfutil - hash, strset
strset and hash are internal data structures, I never intended to expose them through public APIs. I haven't investigated, but if we have a separate shared library (libbpfutil), I imagine we won't be able to hide those APIs, right?
In the other thread you've proposed to copy paste hash implementation into pahole. That's not ideal. If we had libbpfutil other projects could have used that without copy-paste.
But again, let's just reflect for a second that we are talking about the library that takes less than 300KB total.
that's today. Plus mandatory libelf and libz. I would like to have libsysbpf that doesn't depend on libelf/libz for folks that don't need it. Also I'd like to see symbolizer to be included in "libbpf package". Currently it's the main component that libbcc offers, but libbpf doesn't. Say we don't split libbpf. Then symbolizer will bring some dwarf library (say libdwarves ~ 1Mbyte) and libiberty ~ 500k (for c++ demangle). Now we're looking at multi megabyte libbpf package. I think the users would benefit from smaller building blocks. Splitting into 10 mini libs is overkill, of course, but some split is necessary. I agree that moving static linking into separate lib won't really affect .text size. The point is not to reduce text, but to establish a framework where such things are possible. Then symbolizer and anything fancier that would depend on other libs can be part of "libbpf package". I mean single rpm that contains all libbpf libs. Basic libsysbpf wouldn't need libelf/z. libbpfsymbolizer would need libdwarf, etc. Maybe some libbpfnet would depend on libnl or what not.
On Mon, Mar 29, 2021 at 8:28 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Sun, Mar 28, 2021 at 07:38:42PM -0700, Andrii Nakryiko wrote:
See above. I don't know what hassle libbpf is for users today. You were implying code size used for functionality users might not use (e.g., linker). Libbpf is a very small library, <300KB. There are users building tools for constrained embedded systems that use libbpf. There are/were various problems mentioned, but the size of libbpf wasn't yet one of them. We should certainly watch the code bloat, but we are not yet at the point where the library is so big that users are turned off.
It's true that today sizeof(libbpf + libelf + libz) ~ 500k is not a concern. I'm worried what it will get to if we don't start splitting things up.
I'd say let's cross that bridge when we get there. We might never have to even worry about that because libbpf won't grow in size that much.
Why split libxdp into its own lib?
Because libxdp is establishing *a way* to perform XDP chaining and all the setup around that. If it was the only right way to do this (and it was clear it is the only way), then we might have declared that as a solved problem worth providing with libbpf out of the box. I don't think even Toke would claim that his approach is the only possible one and clearly superior to anything else. I think it's too nuanced and complicated a problem to have a single definitive solution.
If tc attach is going to be part of libbpf, all things xdp should be part of libbpf as well.
I'm not TC expert, but it seems to be conceptually equivalent to basic "attach to cgroup" or "attach XDP to interface" or "attach to tracepoint" API, so seems in line with what libbpf is trying to provide. If someone would want to construct higher-level concept on top of that (e.g., some chaining of TC programs or whatnot), then it would be out of scope for libbpf.
But af_xdp folks are probably annoyed that they need to add -lelf and -lz though they're not using them. Just tech debt that eventually needs to be paid.
Those are dependencies of libbpf. Unless we want to re-implement ELF handling code, we can't get rid of -lelf. I don't consider that tech debt at all. As for -lz, it's used for processing /proc/kconfig.gz (for __kconfig externs). We can do dynamic libz.so loading only when __kconfig externs are used, if you think that's a big problem. But libz is such a widely available library that no one has complained so far. Yes, I'm annoyed by having to specify -lelf and -lz as well, but that's how C linking works, so I can't do much about that. Even more, the order matters as well!
And just in the last email you were proposing to add 10 more -l<libbpfsomething> and were wondering what's the downside, so I'm confused about the direction of this discussion.
I would take this opportunity to split libbpf into maintainable pieces:
- libsysbpf - sys_bpf wrappers (pretty much tools/lib/bpf/bpf.c)
- libbpfutil - hash, strset
strset and hash are internal data structures, I never intended to expose them through public APIs. I haven't investigated, but if we have a separate shared library (libbpfutil), I imagine we won't be able to hide those APIs, right?
In the other thread you've proposed to copy-paste the hash implementation into pahole. That's not ideal. If we had libbpfutil, other projects could have used that without copy-paste.
I know it's not ideal. But I don't think libbpf should be in the business of providing generic data structures with stable APIs either. We are stuck with C, unfortunately, so we have to live with its deficiencies.
But again, let's just reflect for a second that we are talking about the library that takes less than 300KB total.
that's today. Plus mandatory libelf and libz. I would like to have libsysbpf that doesn't depend on libelf/libz for folks that don't need it.
TBH, bpf.c is such a minimal shim on top of bpf() syscall, that providing all of its implementation as a single .h wouldn't be too horrible. Then whatever applications want those syscall wrappers would just include bpf/bpf.h and have no need for the library at all.
Also I'd like to see symbolizer to be included in "libbpf package". Currently it's the main component that libbcc offers, but libbpf doesn't. Say we don't split libbpf. Then symbolizer will bring some dwarf library (say libdwarves ~ 1Mbyte) and libiberty ~ 500k (for c++ demangle). Now we're looking at multi megabyte libbpf package.
Right, which is one of the reasons why it probably doesn't belong in libbpf at all. Another is that it's not BPF-specific functionality at all.
I think the users would benefit from smaller building blocks. Splitting into 10 mini libs is overkill, of course, but some split is necessary. I agree that moving static linking into separate lib won't really affect .text size. The point is not to reduce text, but to establish a framework where such things are possible. Then symbolizer and anything fancier that would depend on other libs can be part of "libbpf package". I mean single rpm that contains all libbpf libs. Basic libsysbpf wouldn't need libelf/z. libbpfsymbolizer would need libdwarf, etc. Maybe some libbpfnet would depend on libnl or what not.
I'm against pro-active splitting just in case. I'd rather discuss specific problems when we get to them. I think it's premature right now to split libbpf.
On Tue, Mar 30, 2021 at 1:28 PM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
In the other thread you've proposed to copy-paste the hash implementation into pahole. That's not ideal. If we had libbpfutil, other projects could have used that without copy-paste.
I know it's not ideal. But I don't think libbpf should be in the business of providing generic data structures with stable APIs either.
There is a need for hash in pahole and it's already using libbpf. Would be good to reuse the code.
that's today. Plus mandatory libelf and libz. I would like to have libsysbpf that doesn't depend on libelf/libz for folks that don't need it.
TBH, bpf.c is such a minimal shim on top of bpf() syscall, that providing all of its implementation as a single .h wouldn't be too horrible. Then whatever applications want those syscall wrappers would just include bpf/bpf.h and have no need for the library at all.
1k line bpf.h. hmm. That's not going to be a conventional C header, but it could work I guess.
Also I'd like to see symbolizer to be included in "libbpf package". Currently it's the main component that libbcc offers, but libbpf doesn't. Say we don't split libbpf. Then symbolizer will bring some dwarf library (say libdwarves ~ 1Mbyte) and libiberty ~ 500k (for c++ demangle). Now we're looking at multi megabyte libbpf package.
Right, which is one of the reasons why it probably doesn't belong in libbpf at all. Another is that it's not BPF-specific functionality at all.
The symbolizer, usdt, and the python and lua bindings are what made libbcc successful. I think "libbpf package" should include everything that bpf tracing folks might need. Getting -l flags correct from a single package isn't a big deal compared with the need to deal with different packages that depend on each other.
I'm against pro-active splitting just in case. I'd rather discuss specific problems when we get to them. I think it's premature right now to split libbpf.
Fine. I'm mainly advocating to change the mental model to see libbpf as a collection of tools and libraries and not just a single libbpf.a
Alexei Starovoitov alexei.starovoitov@gmail.com writes:
On Sat, Mar 27, 2021 at 09:32:58PM -0700, Andrii Nakryiko wrote:
I think it's better to start with new library for tc/xdp and have libbpf as a dependency on that new lib. For example we can add it as subdir in tools/lib/bpf/.
Similarly, I think integrating static linking into libbpf was a mistake. It should be a sub library as well.
If we end up with core libbpf and ten sublibs for tc, xdp, af_xdp, linking, whatever else, the users would appreciate that we don't shove a single libbpf at them with a ton of features that they might never use.
What's the concern exactly? The size of the library? Having 10 micro-libraries has its own set of downsides,
specifically?
I'm not convinced that's a better situation for end users. And would certainly cause more hassle for libbpf developers and packagers.
For developers and packagers.. yes. For users.. quite the opposite. The skel gen and static linking must be split out before the next libbpf release. Not a single application linked with libbpf is going to use those pieces.
I'd tend to agree about the skeleton generation, but I have one use case in mind where having the linker in library form would be handy: dynamically building an XDP program at load time from pre-compiled pieces.
Consider xdp-filter[0]: it's a simplistic packet filter that can filter on different bits of the packet header, mostly meant as a demonstration of XDP packet filtering performance. It's also using conditional compilation so that it can be loaded in a mode that skips parsing L4 headers entirely if port-based filtering is not enabled. Right now we do that by pre-compiling five different variants of the XDP program and loading based on the selected feature set, but with linking in libbpf, we could instead have a single BPF program with granular filtering functions and just assemble the final program from those bits at load time.
The actual xdp-filter program may be too simplistic to gain any performance for this, but I believe the general approach could be a way to realise the "improved performance through skipping code" promise of an XDP-based data path. Having linking be part of libbpf will make this straight-forward to integrate into applications.
[0] https://github.com/xdp-project/xdp-tools/tree/master/xdp-filter
bpftool is the one and only user that needs them. Hence forcing libbpf users to increase their .text with dead code is a selfish call by libbpf developers and packagers. The users' priorities must come first.
And what did you include in "core libbpf"?
I would take this opportunity to split libbpf into maintainable pieces:
- libsysbpf - sys_bpf wrappers (pretty much tools/lib/bpf/bpf.c)
- libbpfutil - hash, strset
- libbtf - BTF read/write
- libbpfelf - ELF parsing, CORE, ksym, kconfig
- libbpfskel - skeleton gen used by bpftool only
- libbpflink - linker used by bpftool only
- libbpfnet - networking attachment via netlink including TC and XDP
- libbpftrace - perfbuf, ringbuf
- libxdp - Toke's xdp chaining
- libxsk - af_xdp logic
Huh? You've got to be joking? How is that going to improve things for users? Just the cognitive load of figuring out which linker flags to use is going to be prohibitive. Not to mention the hassle of keeping multiple library versions in sync etc.
If the concern is .text size, surely there are better ways to fix that? LTO is the obvious "automagic" solution, but even without that, just supporting conditional compilation via defines in the existing libbpf ought to achieve the same thing without exposing the gory details to the users?
-Toke
linux-kselftest-mirror@lists.linaro.org