Allow looking up an nf_conn. This allows eBPF programs to leverage nf_conntrack state for similar purposes to socket state use cases, as provided by the socket lookup helpers. This is particularly useful when nf_conntrack state is locally available, but socket state is not.
v2: - Add missing static inline to the functions that need it (kbuild) - Move tests to a separate patch and submit as a series (John) - Improve clarity in helper documentation (John) - Add CONFIG_NF_CONNTRACK=m support (Daniel)
Signed-off-by: Matthew Cover <matthew.cover@stackpath.com> --- include/linux/bpf.h | 29 ++++ include/linux/netfilter.h | 12 ++ include/uapi/linux/bpf.h | 111 ++++++++++++++- kernel/bpf/verifier.c | 105 ++++++++++++++- net/core/filter.c | 277 ++++++++++++++++++++++++++++++++++++++ net/netfilter/core.c | 16 +++ net/netfilter/nf_conntrack_core.c | 1 + scripts/bpf_helpers_doc.py | 4 + tools/include/uapi/linux/bpf.h | 111 ++++++++++++++- 9 files changed, 658 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e3b8f4..f502e1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -239,6 +239,7 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_NF_CONN, /* pointer to bpf_nf_conn */ };
/* type of values returned from helper functions */ @@ -250,6 +251,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_NF_CONN_OR_NULL, /* returns a pointer to a nf_conn or NULL */ };
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -316,6 +318,8 @@ enum bpf_reg_type { PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ + PTR_TO_NF_CONN, /* reg points to struct nf_conn */ + PTR_TO_NF_CONN_OR_NULL, /* reg points to struct nf_conn or NULL */ };
/* The information passed from prog-specific *_is_valid_access @@ -1513,4 +1517,29 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK) +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size); +#else +static inline bool bpf_nf_conn_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + return 0; +} +#endif /* CONFIG_NF_CONNTRACK */ + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index eb312e7..a360ced 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -451,6 +451,9 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); +struct nf_conntrack_tuple_hash * +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} struct nf_conntrack_tuple; @@ -459,6 +462,12 @@ static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, { return false; } +static inline struct nf_conntrack_tuple_hash * +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) +{ + return NULL; +} #endif
struct nf_conn; @@ -469,6 +478,9 @@ struct nf_ct_hook { void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); + struct nf_conntrack_tuple_hash * + (*find_get)(struct net *net, const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple); }; extern struct nf_ct_hook __rcu *nf_ct_hook;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 033d90a..85c4b3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id { * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. + * + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP nf_conntrack entry matching *tuple*, optionally in + * a child network namespace *netns*. The return value must be + * checked, and if non-**NULL**, released via + * **bpf_ct_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or xdp_md (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 nf_conn. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 nf_conn. + * + * If the *netns* is a negative signed 32-bit integer, then the + * nf_conn lookup table in the netns associated with the *ctx* will + * be used. For the TC hooks, this is the netns of the device + * in the skb. For XDP hooks, this is the netns of the device in + * the xdp_md. If *netns* is any other signed 32-bit value greater + * than or equal to zero then it specifies the ID of the netns + * relative to the netns associated with the *ctx*. *netns* values + * beyond the range of 32-bit integers are reserved for future + * use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper will always return NULL if the kernel was compiled + * without **CONFIG_NF_CONNTRACK**. + * Return + * Pointer to **struct bpf_nf_conn**, or **NULL** in case of + * failure. 
+ * + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP nf_conntrack entry matching *tuple*, optionally in + * a child network namespace *netns*. The return value must be + * checked, and if non-**NULL**, released via + * **bpf_ct_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or xdp_md (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 nf_conn. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 nf_conn. + * + * If the *netns* is a negative signed 32-bit integer, then the + * nf_conn lookup table in the netns associated with the *ctx* will + * be used. For the TC hooks, this is the netns of the device + * in the skb. For XDP hooks, this is the netns of the device in + * the xdp_md. If *netns* is any other signed 32-bit value greater + * than or equal to zero then it specifies the ID of the netns + * relative to the netns associated with the *ctx*. *netns* values + * beyond the range of 32-bit integers are reserved for future + * use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper will always return NULL if the kernel was compiled + * without **CONFIG_NF_CONNTRACK**. + * Return + * Pointer to **struct bpf_nf_conn**, or **NULL** in case of + * failure. + * + * int bpf_ct_release(struct bpf_nf_conn *ct) + * Description + * Release the reference held by *ct*. *ct* must be a + * non-**NULL** pointer that was returned from + * **bpf_ct_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id { FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ - FN(send_signal_thread), + FN(send_signal_thread), \ + FN(ct_lookup_tcp), \ + FN(ct_lookup_udp), \ + FN(ct_release),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple { }; };
+struct bpf_nf_conn { + __u32 cpu; + __u32 mark; + __u32 status; + __u32 timeout; +}; + +struct bpf_nf_conntrack_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + struct bpf_xdp_sock { __u32 queue_id; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca17dccc..0ea0ee7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -362,6 +362,11 @@ static const char *ltrim(const char *s) env->prev_linfo = linfo; }
+static bool type_is_nf_ct_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_NF_CONN; +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -381,7 +386,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_NF_CONN_OR_NULL; }
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -395,12 +401,15 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_NF_CONN || + type == PTR_TO_NF_CONN_OR_NULL; }
static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { - return type == ARG_PTR_TO_SOCK_COMMON; + return type == ARG_PTR_TO_SOCK_COMMON || + type == ARG_PTR_TO_NF_CONN; }
/* Determine whether the function releases some resources allocated by another @@ -409,14 +418,17 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ct_release; }
static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ct_lookup_tcp || + func_id == BPF_FUNC_ct_lookup_udp; }
static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -447,6 +459,8 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_NF_CONN] = "nf_conn", + [PTR_TO_NF_CONN_OR_NULL] = "nf_conn_or_null", };
static char slot_type_char[] = { @@ -1913,6 +1927,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_NF_CONN: + case PTR_TO_NF_CONN_OR_NULL: return true; default: return false; @@ -2440,6 +2456,35 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; }
+static int check_nf_ct_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = &regs[regno]; + struct bpf_insn_access_aux info = {}; + bool valid; + + switch (reg->type) { + case PTR_TO_NF_CONN: + valid = bpf_nf_conn_is_valid_access(off, size, t, &info); + break; + default: + valid = false; + } + + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; +} + static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) @@ -2511,6 +2556,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_CTX; }
+static bool is_nf_ct_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_nf_ct_pointer(reg->type); +} + static bool is_sk_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); @@ -2635,6 +2687,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; + case PTR_TO_NF_CONN: + pointer_desc = "nf_conn "; + break; default: break; } @@ -3050,6 +3105,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (type_is_nf_ct_pointer(reg->type)) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_nf_ct_access(env, insn_idx, regno, off, size, t); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) @@ -3099,7 +3163,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg)) { + is_sk_reg(env, insn->dst_reg) || + is_nf_ct_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -3501,6 +3566,19 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } + } else if (arg_type == ARG_PTR_TO_NF_CONN) { + expected_type = PTR_TO_NF_CONN; + if (!type_is_nf_ct_pointer(type)) + goto err_type; + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, "verifier 
internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -4368,6 +4446,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_NF_CONN_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_NF_CONN_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -4649,6 +4731,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_NF_CONN: + case PTR_TO_NF_CONN_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -5915,6 +5999,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_NF_CONN_OR_NULL) { + reg->type = PTR_TO_NF_CONN; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -7232,6 +7318,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_NF_CONN: + case PTR_TO_NF_CONN_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -7760,6 +7848,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_NF_CONN: + case 
PTR_TO_NF_CONN_OR_NULL: return false; default: return true; @@ -8867,6 +8957,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -EINVAL; } continue; + case PTR_TO_NF_CONN: + convert_ctx_access = bpf_nf_conn_convert_ctx_access; + break; default: continue; } diff --git a/net/core/filter.c b/net/core/filter.c index 17de674..80319d3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -74,6 +74,12 @@ #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack.h> +#endif + /** * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -5122,6 +5128,253 @@ static void bpf_update_srh_state(struct sk_buff *skb) }; #endif /* CONFIG_IPV6_SEG6_BPF */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK) +bool bpf_nf_conn_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_nf_conn, + timeout)) + return false; + + if (off % size != 0) + return false; + + return size == sizeof(__u32); +} + +u32 bpf_nf_conn_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_nf_conn, cpu): + BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, cpu) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + offsetof(struct nf_conn, cpu)); + + break; + + case offsetof(struct bpf_nf_conn, mark): +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct nf_conn, mark)); +#else + *target_size = 4; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_nf_conn, status): + BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, status) < 4 || + __IPS_MAX_BIT > 32); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct nf_conn, status)); + + break; + + case offsetof(struct bpf_nf_conn, timeout): + BUILD_BUG_ON(FIELD_SIZEOF(struct nf_conn, timeout) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct nf_conn, timeout)); + + break; + } + + return insn - insn_buf; +} + +static struct nf_conn * +ct_lookup(struct net *net, struct bpf_nf_conntrack_tuple *tuple, + u8 family, u8 proto) +{ + struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple tup; + struct nf_conn *ct = NULL; + + memset(&tup, 0, sizeof(tup)); + + tup.dst.protonum = proto; + tup.src.l3num = family; + + if (family == AF_INET) { + tup.src.u3.ip = tuple->ipv4.saddr; + tup.dst.u3.ip = tuple->ipv4.daddr; + tup.src.u.tcp.port = 
tuple->ipv4.sport; + tup.dst.u.tcp.port = tuple->ipv4.dport; +#if IS_ENABLED(CONFIG_IPV6) + } else { + memcpy(tup.src.u3.ip6, tuple->ipv6.saddr, sizeof(tup.src.u3.ip6)); + memcpy(tup.dst.u3.ip6, tuple->ipv6.daddr, sizeof(tup.dst.u3.ip6)); + tup.src.u.tcp.port = tuple->ipv6.sport; + tup.dst.u.tcp.port = tuple->ipv6.dport; +#endif + } + + hash = nf_ct_find_get(net, &nf_ct_zone_dflt, &tup); + if (!hash) + goto out; + ct = nf_ct_tuplehash_to_ctrack(hash); + +out: + return ct; +} + +static struct nf_conn * +__bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len, + struct net *caller_net, u8 proto, u64 netns_id, u64 flags) +{ + struct nf_conn *ct = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + if (len == sizeof(tuple->ipv4)) + family = AF_INET; + else if (len == sizeof(tuple->ipv6)) + family = AF_INET6; + else + goto out; + + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) + goto out; + + if ((s32)netns_id < 0) { + net = caller_net; + ct = ct_lookup(net, tuple, family, proto); + } else { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + ct = ct_lookup(net, tuple, family, proto); + put_net(net); + } + +out: + return ct; +} + +static struct nf_conn * +bpf_ct_lookup(struct sk_buff *skb, struct bpf_nf_conntrack_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + } else { + caller_net = sock_net(skb->sk); + } + + return __bpf_ct_lookup(skb, tuple, len, caller_net, proto, + netns_id, flags); +} + +BPF_CALL_5(bpf_ct_lookup_tcp, struct sk_buff *, skb, + struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id, + u64, flags) +{ + return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_ct_lookup_tcp_proto = { + .func = bpf_ct_lookup_tcp, + .gpl_only = true, + .pkt_access = true, + .ret_type = 
RET_PTR_TO_NF_CONN_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_ct_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id, + u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + + return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_ct_lookup_tcp_proto = { + .func = bpf_xdp_ct_lookup_tcp, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_PTR_TO_NF_CONN_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_ct_lookup_udp, struct sk_buff *, skb, + struct bpf_nf_conntrack_tuple *, tuple, u32, len, u64, netns_id, + u64, flags) +{ + return (unsigned long)bpf_ct_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_ct_lookup_udp_proto = { + .func = bpf_ct_lookup_udp, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_PTR_TO_NF_CONN_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_ct_lookup_udp, struct xdp_buff *, ctx, + struct bpf_nf_conntrack_tuple *, tuple, u32, len, u32, netns_id, + u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + + return (unsigned long)__bpf_ct_lookup(NULL, tuple, len, caller_net, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_ct_lookup_udp_proto = { + .func = bpf_xdp_ct_lookup_udp, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_PTR_TO_NF_CONN_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = 
ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_ct_release, struct nf_conn *, ct) +{ + nf_conntrack_put(&ct->ct_general); + return 0; +} + +static const struct bpf_func_proto bpf_ct_release_proto = { + .func = bpf_ct_release, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_NF_CONN, +}; +#endif + #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -6139,6 +6392,14 @@ bool bpf_helper_changes_pkt_data(void *func) case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + case BPF_FUNC_ct_lookup_tcp: + return &bpf_ct_lookup_tcp_proto; + case BPF_FUNC_ct_lookup_udp: + return &bpf_ct_lookup_udp_proto; + case BPF_FUNC_ct_release: + return &bpf_ct_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -6180,6 +6441,14 @@ bool bpf_helper_changes_pkt_data(void *func) case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + case BPF_FUNC_ct_lookup_tcp: + return &bpf_xdp_ct_lookup_tcp_proto; + case BPF_FUNC_ct_lookup_udp: + return &bpf_xdp_ct_lookup_udp_proto; + case BPF_FUNC_ct_release: + return &bpf_ct_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -6284,6 +6553,14 @@ bool bpf_helper_changes_pkt_data(void *func) case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + case BPF_FUNC_ct_lookup_tcp: + return &bpf_ct_lookup_tcp_proto; + case BPF_FUNC_ct_lookup_udp: + return &bpf_ct_lookup_udp_proto; + case BPF_FUNC_ct_release: + return &bpf_ct_release_proto; +#endif default: return bpf_base_func_proto(func_id); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78f046e..855c6b0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -617,6 +617,22 @@ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, } 
EXPORT_SYMBOL(nf_ct_get_tuple_skb);
+struct nf_conntrack_tuple_hash * +nf_ct_find_get(struct net *net, const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_ct_hook *ct_hook; + struct nf_conntrack_tuple_hash *ret = NULL; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ret = ct_hook->find_get(net, zone, tuple); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_find_get); + /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f4c4b46..a44df88 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2484,6 +2484,7 @@ int nf_conntrack_init_start(void) .update = nf_conntrack_update, .destroy = destroy_conntrack, .get_tuple_skb = nf_conntrack_get_tuple_skb, + .find_get = nf_conntrack_find_get, };
void nf_conntrack_init_end(void) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 90baf7d..26f0c2a 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -398,6 +398,8 @@ class PrinterHelpers(Printer):
type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_nf_conn', + 'struct bpf_nf_conntrack_tuple', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_sock', @@ -433,6 +435,8 @@ class PrinterHelpers(Printer): '__wsum',
'struct bpf_fib_lookup', + 'struct bpf_nf_conn', + 'struct bpf_nf_conntrack_tuple', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_sock', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 033d90a..85c4b3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2885,6 +2885,88 @@ struct bpf_stack_build_id { * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. + * + * struct bpf_nf_conn *bpf_ct_lookup_tcp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP nf_conntrack entry matching *tuple*, optionally in + * a child network namespace *netns*. The return value must be + * checked, and if non-**NULL**, released via + * **bpf_ct_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or xdp_md (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 nf_conn. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 nf_conn. + * + * If the *netns* is a negative signed 32-bit integer, then the + * nf_conn lookup table in the netns associated with the *ctx* will + * be used. For the TC hooks, this is the netns of the device + * in the skb. For XDP hooks, this is the netns of the device in + * the xdp_md. If *netns* is any other signed 32-bit value greater + * than or equal to zero then it specifies the ID of the netns + * relative to the netns associated with the *ctx*. *netns* values + * beyond the range of 32-bit integers are reserved for future + * use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper will always return NULL if the kernel was compiled + * without **CONFIG_NF_CONNTRACK**. 
+ * Return + * Pointer to **struct bpf_nf_conn**, or **NULL** in case of + * failure. + * + * struct bpf_nf_conn *bpf_ct_lookup_udp(void *ctx, struct bpf_nf_conntrack_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP nf_conntrack entry matching *tuple*, optionally in + * a child network namespace *netns*. The return value must be + * checked, and if non-**NULL**, released via + * **bpf_ct_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or xdp_md (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 nf_conn. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 nf_conn. + * + * If the *netns* is a negative signed 32-bit integer, then the + * nf_conn lookup table in the netns associated with the *ctx* will + * be used. For the TC hooks, this is the netns of the device + * in the skb. For XDP hooks, this is the netns of the device in + * the xdp_md. If *netns* is any other signed 32-bit value greater + * than or equal to zero then it specifies the ID of the netns + * relative to the netns associated with the *ctx*. *netns* values + * beyond the range of 32-bit integers are reserved for future + * use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper will always return NULL if the kernel was compiled + * without **CONFIG_NF_CONNTRACK**. + * Return + * Pointer to **struct bpf_nf_conn**, or **NULL** in case of + * failure. + * + * int bpf_ct_release(struct bpf_nf_conn *ct) + * Description + * Release the reference held by *ct*. *ct* must be a + * non-**NULL** pointer that was returned from + * **bpf_ct_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3004,7 +3086,10 @@ struct bpf_stack_build_id { FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ - FN(send_signal_thread), + FN(send_signal_thread), \ + FN(ct_lookup_tcp), \ + FN(ct_lookup_udp), \ + FN(ct_release),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3278,6 +3363,30 @@ struct bpf_sock_tuple { }; };
+struct bpf_nf_conn { + __u32 cpu; + __u32 mark; + __u32 status; + __u32 timeout; +}; + +struct bpf_nf_conntrack_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + struct bpf_xdp_sock { __u32 queue_id; };