With a raw socket bound to IPPROTO_RAW (ie with hdrincl enabled), the
protocol field of the flow structure, build by raw_sendmsg() /
rawv6_sendmsg()), is set to IPPROTO_RAW. This breaks the ipsec policy
lookup when some policies are defined with a protocol in the selector.
For ipv6, the sin6_port field from 'struct sockaddr_in6' could be used to
specify the protocol. Just accept all values for IPPROTO_RAW socket.
For ipv4, the sin_port field of 'struct sockaddr_in' could not be used
without breaking backward compatibility (the value of this field was never
checked). Let's add a new kind of control message, so that the userland
could specify which protocol is used.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
CC: stable(a)vger.kernel.org
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
---
The first version has been marked 'Awaiting Upstream'. Steffen confirmed
that the 'net' tree should be the target, thus I resend this patch.
I also CC stable(a)vger.kernel.org.
include/net/ip.h | 2 ++
include/uapi/linux/in.h | 1 +
net/ipv4/ip_sockglue.c | 15 ++++++++++++++-
net/ipv4/raw.c | 5 ++++-
net/ipv6/raw.c | 3 ++-
5 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index c3fffaa92d6e..acec504c469a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -76,6 +76,7 @@ struct ipcm_cookie {
__be32 addr;
int oif;
struct ip_options_rcu *opt;
+ __u8 protocol;
__u8 ttl;
__s16 tos;
char priority;
@@ -96,6 +97,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
ipcm->sockc.tsflags = inet->sk.sk_tsflags;
ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
ipcm->addr = inet->inet_saddr;
+ ipcm->protocol = inet->inet_num;
}
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 4b7f2df66b99..e682ab628dfa 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -163,6 +163,7 @@ struct in_addr {
#define IP_MULTICAST_ALL 49
#define IP_UNICAST_IF 50
#define IP_LOCAL_PORT_RANGE 51
+#define IP_PROTOCOL 52
#define MCAST_EXCLUDE 0
#define MCAST_INCLUDE 1
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b511ff0adc0a..ec0fbe874426 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -317,7 +317,17 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->tos = val;
ipc->priority = rt_tos2priority(ipc->tos);
break;
-
+ case IP_PROTOCOL:
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
+ return -EINVAL;
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->protocol = val;
+ break;
default:
return -EINVAL;
}
@@ -1761,6 +1771,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_LOCAL_PORT_RANGE:
val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
break;
+ case IP_PROTOCOL:
+ val = inet_sk(sk)->inet_num;
+ break;
default:
sockopt_release_sock(sk);
return -ENOPROTOOPT;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff712bf2a98d..eadf1c9ef7e4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -532,6 +532,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
ipcm_init_sk(&ipc, inet);
+ /* Keep backward compat */
+ if (hdrincl)
+ ipc.protocol = IPPROTO_RAW;
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -599,7 +602,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE,
- hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ hdrincl ? ipc.protocol : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0, sk->sk_uid);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 7d0adb612bdd..44ee7a2e72ac 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -793,7 +793,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!proto)
proto = inet->inet_num;
- else if (proto != inet->inet_num)
+ else if (proto != inet->inet_num &&
+ inet->inet_num != IPPROTO_RAW)
return -EINVAL;
if (proto > 255)
--
2.39.2
From: Xiubo Li <xiubli(a)redhat.com>
When the MClientSnap reqeust's op is not CEPH_SNAP_OP_SPLIT the
request may still contain a list of 'split_realms', and we need
to skip it anyway. Or it will be parsed as a corrupt snaptrace.
Cc: stable(a)vger.kernel.org
Cc: Frank Schilder <frans(a)dtu.dk>
Reported-by: Frank Schilder <frans(a)dtu.dk>
URL: https://tracker.ceph.com/issues/61200
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
V2:
- Add a detail comment for the code.
fs/ceph/snap.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0e59e95a96d9..0f00f977c0f0 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1114,6 +1114,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
+ } else {
+ /*
+ * In non-SPLIT op case both the 'num_split_inos' and
+ * 'num_split_realms' should always be 0 and this will
+ * do nothing. But the MDS has one bug that in one of
+ * the UPDATE op cases it will pass a 'split_realms'
+ * list by mistake, and then will corrupted the snap
+ * trace in ceph_update_snap_trace().
+ *
+ * So we should skip them anyway here.
+ */
+ p += sizeof(u64) * num_split_inos;
+ p += sizeof(u64) * num_split_realms;
}
/*
--
2.40.1