Thanks for your time reviewing!
Changes in v4: - Handle special case of no offset bits (a=0 / A=2^16).
Cole Dishington (3): net: netfilter: Add RFC-7597 Section 5.1 PSID support xtables API net: netfilter: Add RFC-7597 Section 5.1 PSID support selftests: netfilter: Add RFC-7597 Section 5.1 PSID selftests
include/uapi/linux/netfilter/nf_nat.h | 3 +- net/netfilter/nf_nat_core.c | 39 +++- net/netfilter/nf_nat_masquerade.c | 20 +- net/netfilter/xt_MASQUERADE.c | 44 ++++- .../netfilter/nat_masquerade_psid.sh | 182 ++++++++++++++++++ 5 files changed, 276 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/netfilter/nat_masquerade_psid.sh
Add support for revision 2 of xtables masquerade extension.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz --- include/uapi/linux/netfilter/nf_nat.h | 3 +- net/netfilter/xt_MASQUERADE.c | 44 ++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a64586e77b24..660e53ffdb57 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -12,6 +12,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_NETMAP (1 << 6) +#define NF_NAT_RANGE_PSID (1 << 7)
#define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -20,7 +21,7 @@ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ - NF_NAT_RANGE_NETMAP) + NF_NAT_RANGE_NETMAP | NF_NAT_RANGE_PSID)
struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/xt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c index eae05c178336..dc6870ca2b71 100644 --- a/net/netfilter/xt_MASQUERADE.c +++ b/net/netfilter/xt_MASQUERADE.c @@ -16,7 +16,7 @@ MODULE_AUTHOR("Netfilter Core Team coreteam@netfilter.org"); MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
/* FIXME: Multiple targets. --RR */ -static int masquerade_tg_check(const struct xt_tgchk_param *par) +static int masquerade_tg_check_v0(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -31,8 +31,19 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) return nf_ct_netns_get(par->net, par->family); }
+static int masquerade_tg_check_v1(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + return nf_ct_netns_get(par->net, par->family); +} + static unsigned int -masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) +masquerade_tg_v0(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; @@ -46,6 +57,15 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) xt_out(par)); }
+static unsigned int +masquerade_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), range, + xt_out(par)); +} + static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); @@ -73,6 +93,7 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", + .revision = 0, .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), @@ -84,15 +105,28 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = { }, { #endif .name = "MASQUERADE", + .revision = 0, .family = NFPROTO_IPV4, - .target = masquerade_tg, + .target = masquerade_tg_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, - .checkentry = masquerade_tg_check, + .checkentry = masquerade_tg_check_v0, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, - } + }, + { + .name = "MASQUERADE", + .revision = 1, + .family = NFPROTO_IPV4, + .target = masquerade_tg_v1, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check_v1, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + }, };
static int __init masquerade_tg_init(void)
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: Thanks for your time reviewing!
Changes in v4: - Add support for a=0 (A=2^16) special case. - Now that offset=0 is used for the special case of 2^16 (as it cannot fit in a u16) the divide by zero issues are no longer present.
net/netfilter/nf_nat_core.c | 39 +++++++++++++++++++++++++++---- net/netfilter/nf_nat_masquerade.c | 20 ++++++++++++++-- 2 files changed, 53 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..4a9448684504 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) + const union nf_conntrack_man_proto *max, + const union nf_conntrack_man_proto *base, + bool is_psid) { __be16 port; + u16 psid, psid_mask, offset_mask; + + /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */ + if (is_psid) { + u16 power_j = ntohs(max->all) - ntohs(min->all) + 1; + u32 offset = ntohs(base->all); + u16 power_a; + + if (offset == 0) + offset = 1 << 16; + + power_a = (1 << 16) / offset; + offset_mask = (power_a - 1) * offset; + psid_mask = ((offset / power_j) << 1) - 1; + psid = ntohs(min->all) & psid_mask; + }
switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: + if (is_psid) { + return (offset_mask == 0 || + (ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) && + ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid); + } return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ @@ -215,6 +238,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, else port = tuple->dst.u.all;
+ if (is_psid) { + return (offset_mask == 0 || (ntohs(port) & offset_mask) != 0) && + ((ntohs(port) & psid_mask) == psid); + } return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: @@ -239,7 +266,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple, return 1;
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, - &range->min_proto, &range->max_proto); + &range->min_proto, &range->max_proto, &range->base_proto, + range->flags & NF_NAT_RANGE_PSID); }
static inline int @@ -532,8 +560,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto, + &range->base_proto, + range->flags & + NF_NAT_RANGE_PSID) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..7e2fb0da344a 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,24 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; + + if (range->flags & NF_NAT_RANGE_PSID) { + u16 base = ntohs(range->base_proto.all); + u16 min = ntohs(range->min_proto.all); + u16 off = 0; + + /* If offset=0, port range is in one contiguous block */ + if (base) + off = prandom_u32() % (((1 << 16) / base) - 1); + + newrange.min_proto.all = htons(min + base * off); + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min); + newrange.base_proto = range->base_proto; + newrange.flags = newrange.flags | NF_NAT_RANGE_PROTO_SPECIFIED; + } else { + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + }
/* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
Cole Dishington Cole.Dishington@alliedtelesis.co.nz wrote:
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..4a9448684504 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min,
const union nf_conntrack_man_proto *max)
const union nf_conntrack_man_proto *max,
const union nf_conntrack_man_proto *base,
bool is_psid)
{ __be16 port;
- u16 psid, psid_mask, offset_mask;
- /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */
- if (is_psid) {
u16 power_j = ntohs(max->all) - ntohs(min->all) + 1;
u32 offset = ntohs(base->all);
u16 power_a;
if (offset == 0)
offset = 1 << 16;
power_a = (1 << 16) / offset;
Since the dividie is only needed nat setup and not for each packet I think its ok.
- if (range->flags & NF_NAT_RANGE_PSID) {
u16 base = ntohs(range->base_proto.all);
u16 min = ntohs(range->min_proto.all);
u16 off = 0;
/* If offset=0, port range is in one contiguous block */
if (base)
off = prandom_u32() % (((1 << 16) / base) - 1);
Bases 32769 > gives 0 for the modulo value, so perhaps compute that independently.
You could reject > 32769 in the iptables checkentry target.
Also, base of 21846 and above always give 0 result (% 1).
I don't know psid well enough to give a recommendation here.
If such inputs are nonsensical, just reject it when userspace asks for this and add a
if (WARN_ON_ONCE(base > bogus)) return NF_DROP;
with s small coment explaining that xtables is supposed to not provide such value.
Other than this I think its ok.
I still dislike the 'bool is_psid' in the nat core, but I can't find a better solution.
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: Thanks for time reviewing!
Changes in v5: - Add WARN_ON_ONCE for invalid value of range->base.
net/netfilter/nf_nat_core.c | 39 +++++++++++++++++++++++++++---- net/netfilter/nf_nat_masquerade.c | 27 +++++++++++++++++++-- 2 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..4a9448684504 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) + const union nf_conntrack_man_proto *max, + const union nf_conntrack_man_proto *base, + bool is_psid) { __be16 port; + u16 psid, psid_mask, offset_mask; + + /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */ + if (is_psid) { + u16 power_j = ntohs(max->all) - ntohs(min->all) + 1; + u32 offset = ntohs(base->all); + u16 power_a; + + if (offset == 0) + offset = 1 << 16; + + power_a = (1 << 16) / offset; + offset_mask = (power_a - 1) * offset; + psid_mask = ((offset / power_j) << 1) - 1; + psid = ntohs(min->all) & psid_mask; + }
switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: + if (is_psid) { + return (offset_mask == 0 || + (ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) && + ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid); + } return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ @@ -215,6 +238,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, else port = tuple->dst.u.all;
+ if (is_psid) { + return (offset_mask == 0 || (ntohs(port) & offset_mask) != 0) && + ((ntohs(port) & psid_mask) == psid); + } return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: @@ -239,7 +266,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple, return 1;
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, - &range->min_proto, &range->max_proto); + &range->min_proto, &range->max_proto, &range->base_proto, + range->flags & NF_NAT_RANGE_PSID); }
static inline int @@ -532,8 +560,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto, + &range->base_proto, + range->flags & + NF_NAT_RANGE_PSID) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..dea6106f1699 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; + + if (range->flags & NF_NAT_RANGE_PSID) { + u16 base = ntohs(range->base_proto.all); + u16 min = ntohs(range->min_proto.all); + u16 off = 0; + + /* xtables should stop base > 2^15 by enforcement of + * 0 <= offset_len < 16 argument, with offset_len=0 + * as a special case inwhich base=0. + */ + if (WARN_ON_ONCE(base > (1 << 15))) + return NF_DROP; + + /* If offset=0, port range is in one contiguous block */ + if (base) + off = prandom_u32() % (((1 << 16) / base) - 1); + + newrange.min_proto.all = htons(min + base * off); + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min); + newrange.base_proto = range->base_proto; + newrange.flags = newrange.flags | NF_NAT_RANGE_PROTO_SPECIFIED; + } else { + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + }
/* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
Cole Dishington Cole.Dishington@alliedtelesis.co.nz wrote:
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz
- /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */
- if (is_psid) {
u16 power_j = ntohs(max->all) - ntohs(min->all) + 1;
I think this needs to be 'u32 power_j' to prevent overflow of 65535 + 1 -> 0.
if (base)
off = prandom_u32() % (((1 << 16) / base) - 1);
I think this can use prandom_u32_max(((1 << 16) / base) - 1).
I have no other comments. Other kernel patches LGTM.
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: Thanks for your time reviewing!
Changes in v6: - Use prandom_u32_max() rather than prandom_u32() % max for generating PSID sub-range offset. - Use u32 for power_j for the case of a=0,psid_len=0.
net/netfilter/nf_nat_core.c | 39 +++++++++++++++++++++++++++---- net/netfilter/nf_nat_masquerade.c | 27 +++++++++++++++++++-- 2 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..f07a3473aab5 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) + const union nf_conntrack_man_proto *max, + const union nf_conntrack_man_proto *base, + bool is_psid) { __be16 port; + u16 psid, psid_mask, offset_mask; + + /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */ + if (is_psid) { + u32 power_j = ntohs(max->all) - ntohs(min->all) + 1; + u32 offset = ntohs(base->all); + u16 power_a; + + if (offset == 0) + offset = 1 << 16; + + power_a = (1 << 16) / offset; + offset_mask = (power_a - 1) * offset; + psid_mask = ((offset / power_j) << 1) - 1; + psid = ntohs(min->all) & psid_mask; + }
switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: + if (is_psid) { + return (offset_mask == 0 || + (ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) && + ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid); + } return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ @@ -215,6 +238,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, else port = tuple->dst.u.all;
+ if (is_psid) { + return (offset_mask == 0 || (ntohs(port) & offset_mask) != 0) && + ((ntohs(port) & psid_mask) == psid); + } return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: @@ -239,7 +266,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple, return 1;
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, - &range->min_proto, &range->max_proto); + &range->min_proto, &range->max_proto, &range->base_proto, + range->flags & NF_NAT_RANGE_PSID); }
static inline int @@ -532,8 +560,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto, + &range->base_proto, + range->flags & + NF_NAT_RANGE_PSID) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..19a4754cda76 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; + + if (range->flags & NF_NAT_RANGE_PSID) { + u16 base = ntohs(range->base_proto.all); + u16 min = ntohs(range->min_proto.all); + u16 off = 0; + + /* xtables should stop base > 2^15 by enforcement of + * 0 <= offset_len < 16 argument, with offset_len=0 + * as a special case inwhich base=0. + */ + if (WARN_ON_ONCE(base > (1 << 15))) + return NF_DROP; + + /* If offset=0, port range is in one contiguous block */ + if (base) + off = prandom_u32_max(((1 << 16) / base) - 1); + + newrange.min_proto.all = htons(min + base * off); + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min); + newrange.base_proto = range->base_proto; + newrange.flags = newrange.flags | NF_NAT_RANGE_PROTO_SPECIFIED; + } else { + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + }
/* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
Cole Dishington Cole.Dishington@alliedtelesis.co.nz wrote:
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz
Thanks for your patience and addressing all the comments.
Reviewed-by: Florian Westphal fw@strlen.de
Thanks for your time reviewing!
Changes in v7: - Added net-next to subject. - Added Reviewed-by: Florian Westphal fw@strlen.de to patch 2/3.
Cole Dishington (3): net: netfilter: Add RFC-7597 Section 5.1 PSID support xtables API net: netfilter: Add RFC-7597 Section 5.1 PSID support selftests: netfilter: Add RFC-7597 Section 5.1 PSID selftests
include/uapi/linux/netfilter/nf_nat.h | 3 +- net/netfilter/nf_nat_core.c | 39 +++- net/netfilter/nf_nat_masquerade.c | 27 ++- net/netfilter/xt_MASQUERADE.c | 44 ++++- .../netfilter/nat_masquerade_psid.sh | 182 ++++++++++++++++++ 5 files changed, 283 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/netfilter/nat_masquerade_psid.sh
Add support for revision 2 of xtables masquerade extension.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: No changes.
include/uapi/linux/netfilter/nf_nat.h | 3 +- net/netfilter/xt_MASQUERADE.c | 44 ++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a64586e77b24..660e53ffdb57 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -12,6 +12,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_NETMAP (1 << 6) +#define NF_NAT_RANGE_PSID (1 << 7)
#define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -20,7 +21,7 @@ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ - NF_NAT_RANGE_NETMAP) + NF_NAT_RANGE_NETMAP | NF_NAT_RANGE_PSID)
struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/xt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c index eae05c178336..dc6870ca2b71 100644 --- a/net/netfilter/xt_MASQUERADE.c +++ b/net/netfilter/xt_MASQUERADE.c @@ -16,7 +16,7 @@ MODULE_AUTHOR("Netfilter Core Team coreteam@netfilter.org"); MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
/* FIXME: Multiple targets. --RR */ -static int masquerade_tg_check(const struct xt_tgchk_param *par) +static int masquerade_tg_check_v0(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -31,8 +31,19 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) return nf_ct_netns_get(par->net, par->family); }
+static int masquerade_tg_check_v1(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + return nf_ct_netns_get(par->net, par->family); +} + static unsigned int -masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) +masquerade_tg_v0(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; @@ -46,6 +57,15 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) xt_out(par)); }
+static unsigned int +masquerade_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), range, + xt_out(par)); +} + static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); @@ -73,6 +93,7 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", + .revision = 0, .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), @@ -84,15 +105,28 @@ static struct xt_target masquerade_tg_reg[] __read_mostly = { }, { #endif .name = "MASQUERADE", + .revision = 0, .family = NFPROTO_IPV4, - .target = masquerade_tg, + .target = masquerade_tg_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, - .checkentry = masquerade_tg_check, + .checkentry = masquerade_tg_check_v0, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, - } + }, + { + .name = "MASQUERADE", + .revision = 1, + .family = NFPROTO_IPV4, + .target = masquerade_tg_v1, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check_v1, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + }, };
static int __init masquerade_tg_init(void)
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz Reviewed-by: Florian Westphal fw@strlen.de ---
Notes: Changes: - Added Reviewed-by: Florian Westphal fw@strlen.de.
net/netfilter/nf_nat_core.c | 39 +++++++++++++++++++++++++++---- net/netfilter/nf_nat_masquerade.c | 27 +++++++++++++++++++-- 2 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..f07a3473aab5 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -195,13 +195,36 @@ static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) + const union nf_conntrack_man_proto *max, + const union nf_conntrack_man_proto *base, + bool is_psid) { __be16 port; + u16 psid, psid_mask, offset_mask; + + /* In this case we are in PSID mode, avoid checking all ranges by computing bitmasks */ + if (is_psid) { + u32 power_j = ntohs(max->all) - ntohs(min->all) + 1; + u32 offset = ntohs(base->all); + u16 power_a; + + if (offset == 0) + offset = 1 << 16; + + power_a = (1 << 16) / offset; + offset_mask = (power_a - 1) * offset; + psid_mask = ((offset / power_j) << 1) - 1; + psid = ntohs(min->all) & psid_mask; + }
switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: + if (is_psid) { + return (offset_mask == 0 || + (ntohs(tuple->src.u.icmp.id) & offset_mask) != 0) && + ((ntohs(tuple->src.u.icmp.id) & psid_mask) == psid); + } return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ @@ -215,6 +238,10 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, else port = tuple->dst.u.all;
+ if (is_psid) { + return (offset_mask == 0 || (ntohs(port) & offset_mask) != 0) && + ((ntohs(port) & psid_mask) == psid); + } return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: @@ -239,7 +266,8 @@ static int in_range(const struct nf_conntrack_tuple *tuple, return 1;
return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, - &range->min_proto, &range->max_proto); + &range->min_proto, &range->max_proto, &range->base_proto, + range->flags & NF_NAT_RANGE_PSID); }
static inline int @@ -532,8 +560,11 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + &range->min_proto, + &range->max_proto, + &range->base_proto, + range->flags & + NF_NAT_RANGE_PSID) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..19a4754cda76 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; + + if (range->flags & NF_NAT_RANGE_PSID) { + u16 base = ntohs(range->base_proto.all); + u16 min = ntohs(range->min_proto.all); + u16 off = 0; + + /* xtables should stop base > 2^15 by enforcement of + * 0 <= offset_len < 16 argument, with offset_len=0 + * as a special case inwhich base=0. + */ + if (WARN_ON_ONCE(base > (1 << 15))) + return NF_DROP; + + /* If offset=0, port range is in one contiguous block */ + if (base) + off = prandom_u32_max(((1 << 16) / base) - 1); + + newrange.min_proto.all = htons(min + base * off); + newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min); + newrange.base_proto = range->base_proto; + newrange.flags = newrange.flags | NF_NAT_RANGE_PROTO_SPECIFIED; + } else { + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + }
/* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
Hi,
On Mon, Aug 09, 2021 at 04:10:36PM +1200, Cole Dishington wrote:
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz Reviewed-by: Florian Westphal fw@strlen.de
[...]
Looking at the userspace logic:
https://patchwork.ozlabs.org/project/netfilter-devel/patch/20210716002219.30...
Chunk extracted from void parse_psid(...)
offset = (1 << (16 - offset_len));
Assuming offset_len = 6, then you skip 0-1023 ports, OK.
psid = psid << (16 - offset_len - psid_len);
This psid calculation is correct? Maybe:
psid = psid << (16 - offset_len);
instead?
psid=0 => 0 << (16 - 6) = 1024 psid=1 => 1 << (16 - 6) = 2048
This is implicitly assuming that 64 PSIDs are available, each of them taking 1024 ports, ie. psid_len is 6 bits. But why are you subtracting the psid_len above?
/* Handle the special case of no offset bits (a=0), so offset loops */ min = psid;
OK, this line above is the minimal port in the range
if (offset) min += offset;
... which is incremented by the offset (to skip the 0-1023 ports).
r->min_proto.all = htons(min); r->max_proto.all = htons(min + ((1 << (16 - offset_len - psid_len)) - 1));
Here, you subtract psid_len again, not sure why.
r->base_proto.all = htons(offset);
base is set to offset, ie. 1024.
r->flags |= NF_NAT_RANGE_PSID; r->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
Now looking at the kernel side.
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..19a4754cda76 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
- if (range->flags & NF_NAT_RANGE_PSID) {
u16 base = ntohs(range->base_proto.all);
u16 min = ntohs(range->min_proto.all);
u16 off = 0;
/* xtables should stop base > 2^15 by enforcement of
* 0 <= offset_len < 16 argument, with offset_len=0
* as a special case inwhich base=0.
I don't understand this comment.
*/
if (WARN_ON_ONCE(base > (1 << 15)))
return NF_DROP;
/* If offset=0, port range is in one contiguous block */
if (base)
off = prandom_u32_max(((1 << 16) / base) - 1);
Assuming the example above, base is set to 1024. Then, off is a random value between UINT16_MAX (you expressed this as 1 << 16) and the base which is 1024 minus 1.
So this is picking a random off (actually the PSID?) between 0 and 63. What about clashes? I mean, two different machines behind the NAT might get the same off.
newrange.min_proto.all = htons(min + base * off);
min could be 1024, 2048, 3072... you add base which is 1024 * off.
Is this duplicated? Both calculated in user and kernel space?
newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min);
I'm stopping here, I'm getting lost.
My understanding about this RFC is that you would like to split the 16-bit ports in ranges to uniquely identify the host behind the NAT.
Why don't you just you just select the port range from userspace utilizing the existing infrastructure? I mean, why do you need this kernel patch?
Florian already suggested:
Is it really needed to place all of this in the nat core?
The only thing that has to be done in the NAT core, afaics, is to suppress port reallocation attmepts when NF_NAT_RANGE_PSID is set.
Is there a reason why nf_nat_masquerade_ipv4/6 can't be changed instead to do what you want?
AFAICS its enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init the upper/lower boundaries, i.e. change input given to nf_nat_setup_info().
extracted from:
https://patchwork.ozlabs.org/project/netfilter-devel/patch/20210422023506.46...
Hello,
Thanks for your time reviewing!
On Wed, 2021-08-25 at 19:05 +0200, Pablo Neira Ayuso wrote:
Hi,
On Mon, Aug 09, 2021 at 04:10:36PM +1200, Cole Dishington wrote:
Adds support for masquerading into a smaller subset of ports - defined by the PSID values from RFC-7597 Section 5.1. This is part of the support for MAP-E and Lightweight 4over6, which allows multiple devices to share an IPv4 address by splitting the L4 port / id into ranges.
Co-developed-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Signed-off-by: Anthony Lineham anthony.lineham@alliedtelesis.co.nz Co-developed-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Scott Parlane scott.parlane@alliedtelesis.co.nz Signed-off-by: Blair Steven blair.steven@alliedtelesis.co.nz Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz Reviewed-by: Florian Westphal fw@strlen.de
[...]
Looking at the userspace logic:
https://scanmail.trustwave.com/?c=20988&d=6vim4fcVLjPkIbLUDqz3Tj2W4gXWNC...
Chunk extracted from void parse_psid(...)
offset = (1 << (16 - offset_len));
Assuming offset_len = 6, then you skip 0-1023 ports, OK.
psid = psid << (16 - offset_len - psid_len);
This psid calculation is correct? Maybe:
psid = psid << (16 - offset_len);
PSID port numbers have the form [offset|PSID|j] and 16 = offset_length + PSID_length + j_length. The PSID calculation above is bit shifting the passed psid up j_length.
The userspace tool accepts the unshifted psid to be consistent with how RFC7597 specified it (see RFC7597 Appendix A. Examples).
instead?
psid=0 => 0 << (16 - 6) = 1024 psid=1 => 1 << (16 - 6) = 2048
This is implicitly assuming that 64 PSIDs are available, each of them taking 1024 ports, ie. psid_len is 6 bits. But why are you subtracting the psid_len above?
/* Handle the special case of no offset bits (a=0), so offset loops */ min = psid;
OK, this line above is the minimal port in the range
if (offset) min += offset;
... which is incremented by the offset (to skip the 0-1023 ports).
r->min_proto.all = htons(min); r->max_proto.all = htons(min + ((1 << (16 - offset_len - psid_len)) - 1));
Here, you subtract psid_len again, not sure why.
Each PSID port range is made up of many smaller contiguous port sub-ranges (except for the special case of offset_len = 0) e.g. for PSID=0x34,psid_length=8,psid_offset=6 the ranges are 1232-1235, 2256-2259, ..., 63696-63699, 64720-64723 (Taken from rfc7597 Appendix A. Examples). The above calculation is selecting the first sub-range. Max is computed by finding j_length and filling it with 1's.
r->base_proto.all = htons(offset);
base is set to offset, ie. 1024.
r->flags |= NF_NAT_RANGE_PSID; r->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
Now looking at the kernel side.
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..19a4754cda76 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -55,8 +55,31 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
- if (range->flags & NF_NAT_RANGE_PSID) {
u16 base = ntohs(range->base_proto.all);
u16 min = ntohs(range->min_proto.all);
u16 off = 0;
/* xtables should stop base > 2^15 by enforcement of
* 0 <= offset_len < 16 argument, with offset_len=0
* as a special case inwhich base=0.
I don't understand this comment.
This is a sanity check. The userspace tool restricts offset_len to the specified range and since base = 2^(16 - offset_len) (or base = 0 for the special case of offset_len = 16) the below condition should never be true. However, if base greater than 1<<15 was allowed, a divide by zero error would occur on the block below.
*/
if (WARN_ON_ONCE(base > (1 << 15)))
return NF_DROP;
/* If offset=0, port range is in one contiguous block */
if (base)
off = prandom_u32_max(((1 << 16) / base) - 1);
Assuming the example above, base is set to 1024. Then, off is a random value between UINT16_MAX (you expressed this as 1 << 16) and the base which is 1024 minus 1.
So this is picking a random off (actually the PSID?) between 0 and 63. What about clashes? I mean, two different machines behind the NAT might get the same off.
newrange.min_proto.all = htons(min + base * off);
min could be 1024, 2048, 3072... you add base which is 1024 * off.
Is this duplicated? Both calculated in user and kernel space?
Each PSID value defines many contiguous port sub-ranges. The randomly chosen off selects the ith sub-range for a given PSID e.g. off=1 would select 2256-2259 for rfc7597 Appendix A. Examples.
The userspace tool calculates the min and max of the first sub-range for a given psid, whereas the above randomly selects one of the sub-ranges for a given psid.
j_length determines how large each sub-range will be, so for small j_length values there still is the risk the chosen sub-range will be exhausted.
newrange.max_proto.all = htons(ntohs(newrange.min_proto.all) + ntohs(range->max_proto.all) - min);
I'm stopping here, I'm getting lost.
My understanding about this RFC is that you would like to split the 16-bit ports in ranges to uniquely identify the host behind the NAT.
Why don't you just you just select the port range from userspace utilizing the existing infrastructure? I mean, why do you need this kernel patch?
If utilizing existing infrastruture to install PSID port ranges a lot of rules would be required as each PSID port range is made up of many smaller sub-ranges.
e.g. (from rfc7597 Appendix A. Examples) for psid_length=8,offset_length=6 each PSID would need 63 NF_NAT_RANGE_PROTO_SPECIFIED rules, hence a total of 16128 rules if all the PSIDs were allocated.
Florian already suggested:
Is it really needed to place all of this in the nat core?
The only thing that has to be done in the NAT core, afaics, is to suppress port reallocation attmepts when NF_NAT_RANGE_PSID is set.
Is there a reason why nf_nat_masquerade_ipv4/6 can't be changed instead to do what you want?
AFAICS its enough to set NF_NAT_RANGE_PROTO_SPECIFIED and init the upper/lower boundaries, i.e. change input given to nf_nat_setup_info().
extracted from:
https://scanmail.trustwave.com/?c=20988&d=6vim4fcVLjPkIbLUDqz3Tj2W4gXWNC...
Add selftests for masquerading into a smaller subset of ports defined by PSID.
Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: No changes.
.../netfilter/nat_masquerade_psid.sh | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 tools/testing/selftests/netfilter/nat_masquerade_psid.sh
diff --git a/tools/testing/selftests/netfilter/nat_masquerade_psid.sh b/tools/testing/selftests/netfilter/nat_masquerade_psid.sh new file mode 100644 index 000000000000..56c1b509caf6 --- /dev/null +++ b/tools/testing/selftests/netfilter/nat_masquerade_psid.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# <:copyright-gpl +# Copyright (C) 2021 Allied Telesis Labs NZ +# +# check that NAT can masquerade using PSID defined ranges. +# +# Setup is: +# +# nsclient1(veth0) -> (veth1)nsrouter(veth2) -> (veth0)nsclient2 +# Setup a nat masquerade rule with psid defined ranges. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 +ns_all="nsclient1 nsrouter nsclient2" + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 +readonly server_port=8080 + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ipv4() { + echo -n 192.168.$1.$2 +} + +cleanup() { + for n in $ns_all; do ip netns del $n;done + + if [ -f "${outfile}" ]; then + rm "$outfile" + fi + if [ -f "${infile}" ]; then + rm "$infile" + fi +} + +server_listen() { + ip netns exec nsclient2 nc -l -p "$server_port" > "$outfile" & + server_pid=$! + sleep 0.2 +} + +client_connect() { + ip netns exec nsclient1 timeout 2 nc -w 1 -p "$port" $(ipv4 2 2) "$server_port" < $infile +} + +verify_data() { + local _ret=0 + wait "$server_pid" + cmp "$infile" "$outfile" 2>/dev/null + _ret=$? + rm "$outfile" + return $_ret +} + +test_service() { + server_listen + client_connect + verify_data +} + +check_connection() { + local _ret=0 + entry=$(ip netns exec nsrouter conntrack -p tcp --sport $port -L 2>&1) + entry=${entry##*sport=8080 dport=} + entry=${entry%% *} + + if [[ "x$(( ($entry & $psid_mask) / $two_power_j ))" != "x$psid" ]]; then + _ret=1 + echo "Failed psid mask check for $offset_len:$psid:$psid_length with port $entry" + fi + + if [[ "x$_ret" = "x0" ]] && + [[ "x$offset_mask" != "x0" -a "x$(( ($entry & $offset_mask) ))" == "x0" ]]; then + _ret=1 + echo "Failed offset mask check for $offset_len:$psid:$psid_length with port $entry" + fi + return $_ret +} + +run_test() { + ip netns exec nsrouter iptables -A FORWARD -i veth1 -j ACCEPT + ip netns exec nsrouter iptables -P FORWARD DROP + ip netns exec nsrouter iptables -A FORWARD -m state --state ESTABLISHED,RELATED -j ACCEPT + ip netns exec nsrouter iptables -t nat --new psid + ip netns exec nsrouter iptables -t nat --insert psid -j MASQUERADE \ + --psid $offset_len:$psid:$psid_length + ip netns exec nsrouter iptables -t nat -I POSTROUTING -o veth2 -j psid + + # calculate psid mask + offset=$(( 1 << (16 - $offset_len) )) + two_power_j=$(( $offset / (1 << $psid_length) )) + offset_mask=$(( ( (1 << $offset_len) - 1 ) << (16 - $offset_len) )) + psid_mask=$(( ( (1 << $psid_length) - 1) * $two_power_j )) + + # Create file + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 >/dev/null 2>&1 + + # Test multiple ports + for p in 1 2 3 4 5; do + port=1080$p + + test_service + if [ $? -ne 0 ]; then + ret=1 + break + fi + + check_connection + if [ $? -ne 0 ]; then + ret=1 + break + fi + done + + # tidy up test rules + ip netns exec nsrouter iptables -F + ip netns exec nsrouter iptables -t nat -F + ip netns exec nsrouter iptables -t nat -X psid +} + +for n in $ns_all; do + ip netns add $n + ip -net $n link set lo up +done + +for i in 1 2; do + ip link add veth0 netns nsclient$i type veth peer name veth$i netns nsrouter + + ip -net nsclient$i link set veth0 up + ip -net nsclient$i addr add $(ipv4 $i 2)/24 dev veth0 + + ip -net nsrouter link set veth$i up + ip -net nsrouter addr add $(ipv4 $i 1)/24 dev veth$i +done + +ip -net nsclient1 route add default via $(ipv4 1 1) +ip -net nsclient2 route add default via $(ipv4 2 1) + +ip netns exec nsrouter sysctl -q net.ipv4.conf.all.forwarding=1 + +offset_len=0 +psid_length=8 +for psid in 0 52; do + run_test + if [ $? -ne 0 ]; then + break + fi +done + +offset_len=6 +psid_length=8 +for psid in 0 52; do + run_test + if [ $? -ne 0 ]; then + break + fi +done + +cleanup +exit $ret
Add selftests for masquerading into a smaller subset of ports defined by PSID.
Signed-off-by: Cole Dishington Cole.Dishington@alliedtelesis.co.nz ---
Notes: Thanks for your time reviewing!
Changes in v4: - Add tests for a=0 (A=2^16) special case. - Update to use offset_length (from iptables change).
.../netfilter/nat_masquerade_psid.sh | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 tools/testing/selftests/netfilter/nat_masquerade_psid.sh
diff --git a/tools/testing/selftests/netfilter/nat_masquerade_psid.sh b/tools/testing/selftests/netfilter/nat_masquerade_psid.sh new file mode 100644 index 000000000000..56c1b509caf6 --- /dev/null +++ b/tools/testing/selftests/netfilter/nat_masquerade_psid.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# <:copyright-gpl +# Copyright (C) 2021 Allied Telesis Labs NZ +# +# check that NAT can masquerade using PSID defined ranges. +# +# Setup is: +# +# nsclient1(veth0) -> (veth1)nsrouter(veth2) -> (veth0)nsclient2 +# Setup a nat masquerade rule with psid defined ranges. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 +ns_all="nsclient1 nsrouter nsclient2" + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 +readonly server_port=8080 + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ipv4() { + echo -n 192.168.$1.$2 +} + +cleanup() { + for n in $ns_all; do ip netns del $n;done + + if [ -f "${outfile}" ]; then + rm "$outfile" + fi + if [ -f "${infile}" ]; then + rm "$infile" + fi +} + +server_listen() { + ip netns exec nsclient2 nc -l -p "$server_port" > "$outfile" & + server_pid=$! + sleep 0.2 +} + +client_connect() { + ip netns exec nsclient1 timeout 2 nc -w 1 -p "$port" $(ipv4 2 2) "$server_port" < $infile +} + +verify_data() { + local _ret=0 + wait "$server_pid" + cmp "$infile" "$outfile" 2>/dev/null + _ret=$? + rm "$outfile" + return $_ret +} + +test_service() { + server_listen + client_connect + verify_data +} + +check_connection() { + local _ret=0 + entry=$(ip netns exec nsrouter conntrack -p tcp --sport $port -L 2>&1) + entry=${entry##*sport=8080 dport=} + entry=${entry%% *} + + if [[ "x$(( ($entry & $psid_mask) / $two_power_j ))" != "x$psid" ]]; then + _ret=1 + echo "Failed psid mask check for $offset_len:$psid:$psid_length with port $entry" + fi + + if [[ "x$_ret" = "x0" ]] && + [[ "x$offset_mask" != "x0" -a "x$(( ($entry & $offset_mask) ))" == "x0" ]]; then + _ret=1 + echo "Failed offset mask check for $offset_len:$psid:$psid_length with port $entry" + fi + return $_ret +} + +run_test() { + ip netns exec nsrouter iptables -A FORWARD -i veth1 -j ACCEPT + ip netns exec nsrouter iptables -P FORWARD DROP + ip netns exec nsrouter iptables -A FORWARD -m state --state ESTABLISHED,RELATED -j ACCEPT + ip netns exec nsrouter iptables -t nat --new psid + ip netns exec nsrouter iptables -t nat --insert psid -j MASQUERADE \ + --psid $offset_len:$psid:$psid_length + ip netns exec nsrouter iptables -t nat -I POSTROUTING -o veth2 -j psid + + # calculate psid mask + offset=$(( 1 << (16 - $offset_len) )) + two_power_j=$(( $offset / (1 << $psid_length) )) + offset_mask=$(( ( (1 << $offset_len) - 1 ) << (16 - $offset_len) )) + psid_mask=$(( ( (1 << $psid_length) - 1) * $two_power_j )) + + # Create file + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 >/dev/null 2>&1 + + # Test multiple ports + for p in 1 2 3 4 5; do + port=1080$p + + test_service + if [ $? -ne 0 ]; then + ret=1 + break + fi + + check_connection + if [ $? -ne 0 ]; then + ret=1 + break + fi + done + + # tidy up test rules + ip netns exec nsrouter iptables -F + ip netns exec nsrouter iptables -t nat -F + ip netns exec nsrouter iptables -t nat -X psid +} + +for n in $ns_all; do + ip netns add $n + ip -net $n link set lo up +done + +for i in 1 2; do + ip link add veth0 netns nsclient$i type veth peer name veth$i netns nsrouter + + ip -net nsclient$i link set veth0 up + ip -net nsclient$i addr add $(ipv4 $i 2)/24 dev veth0 + + ip -net nsrouter link set veth$i up + ip -net nsrouter addr add $(ipv4 $i 1)/24 dev veth$i +done + +ip -net nsclient1 route add default via $(ipv4 1 1) +ip -net nsclient2 route add default via $(ipv4 2 1) + +ip netns exec nsrouter sysctl -q net.ipv4.conf.all.forwarding=1 + +offset_len=0 +psid_length=8 +for psid in 0 52; do + run_test + if [ $? -ne 0 ]; then + break + fi +done + +offset_len=6 +psid_length=8 +for psid in 0 52; do + run_test + if [ $? -ne 0 ]; then + break + fi +done + +cleanup +exit $ret
linux-kselftest-mirror@lists.linaro.org