2024-10-29, 11:47:25 +0100, Antonio Quartulli wrote:
+static void ovpn_socket_release_work(struct work_struct *work)
+{
+        struct ovpn_socket *sock = container_of(work, struct ovpn_socket, work);
+
+        ovpn_socket_detach(sock->sock);
+        kfree_rcu(sock, rcu);
+}
+
+static void ovpn_socket_schedule_release(struct ovpn_socket *sock)
+{
+        INIT_WORK(&sock->work, ovpn_socket_release_work);
+        schedule_work(&sock->work);
+}
How does module unloading know that it has to wait for this work to complete? Will ovpn_cleanup get stuck until some refcount gets released by this work?
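For illustration, one way to make that dependency explicit would be a module-private workqueue that the exit path drains before the code goes away. Only a sketch: ovpn_wq and the exit hook shown here are hypothetical names, not something this series defines:

        /* sketch: a dedicated workqueue (allocated in module init with
         * alloc_workqueue()) gives unload an explicit point where all
         * pending socket-release work is flushed
         */
        static struct workqueue_struct *ovpn_wq;        /* hypothetical */

        static void ovpn_socket_schedule_release(struct ovpn_socket *sock)
        {
                INIT_WORK(&sock->work, ovpn_socket_release_work);
                queue_work(ovpn_wq, &sock->work);
        }

        static void __exit ovpn_cleanup(void)
        {
                /* destroy_workqueue() drains all queued work first, so
                 * ovpn_socket_release_work() cannot outlive the module
                 */
                destroy_workqueue(ovpn_wq);
        }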
[...]
+static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
+{
+        struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
+        struct strp_msg *msg = strp_msg(skb);
+        size_t pkt_len = msg->full_len - 2;
+        size_t off = msg->offset + 2;
+
+        /* ensure skb->data points to the beginning of the openvpn packet */
+        if (!pskb_pull(skb, off)) {
+                net_warn_ratelimited("%s: packet too small\n",
+                                     peer->ovpn->dev->name);
+                goto err;
+        }
+
+        /* strparser does not trim the skb for us, therefore we do it now */
+        if (pskb_trim(skb, pkt_len) != 0) {
+                net_warn_ratelimited("%s: trimming skb failed\n",
+                                     peer->ovpn->dev->name);
+                goto err;
+        }
+
+        /* we need the first byte of data to be accessible
+         * to extract the opcode and the key ID later on
+         */
+        if (!pskb_may_pull(skb, 1)) {
+                net_warn_ratelimited("%s: packet too small to fetch opcode\n",
+                                     peer->ovpn->dev->name);
+                goto err;
+        }
+
+        /* DATA_V2 packets are handled in kernel, the rest goes to user space */
+        if (likely(ovpn_opcode_from_skb(skb, 0) == OVPN_DATA_V2)) {
+                /* hold reference to peer as required by ovpn_recv().
+                 *
+                 * NOTE: in this context we should already be holding a
+                 * reference to this peer, therefore ovpn_peer_hold() is
+                 * not expected to fail
+                 */
+                if (WARN_ON(!ovpn_peer_hold(peer)))
+                        goto err;
+
+                ovpn_recv(peer, skb);
+        } else {
+                /* The packet size header must be there when sending the packet
+                 * to userspace, therefore we put it back
+                 */
+                skb_push(skb, 2);
+                ovpn_tcp_to_userspace(peer, strp->sk, skb);
+        }
+
+        return;
+err:
+        netdev_err(peer->ovpn->dev,
+                   "cannot process incoming TCP data for peer %u\n", peer->id);
This should also be ratelimited, and maybe just combined with the net_warn_ratelimited just before each goto (see the sketch after this hunk).
+        dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
+        kfree_skb(skb);
+        ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
+}
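Concretely, the ratelimited form could look like this (just a sketch; note net_err_ratelimited() takes the device name as a format argument, unlike netdev_err()):

        err:
                net_err_ratelimited("%s: cannot process incoming TCP data for peer %u\n",
                                    peer->ovpn->dev->name, peer->id);
                dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
                kfree_skb(skb);
                ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);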
[...]
+void ovpn_tcp_socket_detach(struct socket *sock)
+{
[...]
+        /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */
+        sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
+        sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
+        sock->sk->sk_prot = peer->tcp.sk_cb.prot;
+        sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;
+        rcu_assign_sk_user_data(sock->sk, NULL);
+
+        rcu_read_unlock();
+
+        /* cancel any ongoing work. Done after removing the CBs so that these
+         * workers cannot be re-armed
+         */
I'm not sure whether a barrier is needed here to prevent compiler/CPU reordering between restoring the callbacks and cancelling the work (one possible approach is sketched after this hunk).
+        cancel_work_sync(&peer->tcp.tx_work);
+        strp_done(&peer->tcp.strp);
+}
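If ordering does turn out to matter, one option (a sketch only; I haven't checked whether cancel_work_sync() already gives a strong enough guarantee) is to publish the restored callbacks under sk_callback_lock, the usual pattern when flipping sk callbacks that softirq context may be reading concurrently:

        /* sketch: make the callback restore atomic w.r.t. concurrent
         * readers in softirq context
         */
        write_lock_bh(&sock->sk->sk_callback_lock);
        sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
        sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
        sock->sk->sk_prot = peer->tcp.sk_cb.prot;
        sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;
        rcu_assign_sk_user_data(sock->sk, NULL);
        write_unlock_bh(&sock->sk->sk_callback_lock);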
+static void ovpn_tcp_send_sock(struct ovpn_peer *peer)
+{
+        struct sk_buff *skb = peer->tcp.out_msg.skb;
+
+        if (!skb)
+                return;
+
+        if (peer->tcp.tx_in_progress)
+                return;
+
+        peer->tcp.tx_in_progress = true;
Sorry, I never answered your question about my concerns in a previous review here.
We can reach ovpn_tcp_send_sock in two different contexts:
 - lock_sock (either from ovpn_tcp_sendmsg or ovpn_tcp_tx_work)
 - bh_lock_sock (from ovpn_tcp_send_skb, i.e. the "data path")
These are not fully mutually exclusive. lock_sock grabs bh_lock_sock (a spinlock) for a brief period to mark the (sleeping/mutex) lock as taken, and then releases it.
So when bh_lock_sock is held, it's not possible to grab lock_sock. But when lock_sock is taken, it's still possible to grab bh_lock_sock.
The buggy scenario would be:
    (data path encrypt)                    (sendmsg)
    ovpn_tcp_send_skb                      lock_sock
                                             bh_lock_sock + owned=1 + bh_unlock_sock
     bh_lock_sock
     ovpn_tcp_send_sock_skb                ovpn_tcp_send_sock_skb
      !peer->tcp.out_msg.skb                !peer->tcp.out_msg.skb
      peer->tcp.out_msg.skb = ...           peer->tcp.out_msg.skb = ...
      ovpn_tcp_send_sock                    ovpn_tcp_send_sock
       !peer->tcp.tx_in_progress             !peer->tcp.tx_in_progress
       peer->tcp.tx_in_progress = true       peer->tcp.tx_in_progress = true
       // proceed                            // proceed
That's 2 similar races, one on out_msg.skb and one on tx_in_progress. It's a bit unlikely (but not impossible) that we'll have 2 cpus trying to call skb_send_sock_locked at the same time, but if they just overwrite each other's skb/len it's already pretty bad. The end of ovpn_tcp_send_sock might also reset peer->tcp.out_msg.* just as ovpn_tcp_send_skb -> ovpn_tcp_send_sock_skb starts setting it up (peer->tcp.out_msg.skb gets cleared, ovpn_tcp_send_sock_skb proceeds and sets skb+len, then maybe len gets reset to 0 by ovpn_tcp_send_sock).
To avoid this problem, esp_output_tcp_finish (net/ipv4/esp4.c) does:
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk))
                err = espintcp_queue_out(sk, skb);
        else
                err = espintcp_push_skb(sk, skb);
        bh_unlock_sock(sk);
(espintcp_push_skb is roughly equivalent to ovpn_tcp_send_sock_skb)
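Adapted to ovpn, the data-path entry might look roughly like this (only a sketch: ovpn_tcp_queue_out(), which would stash the skb for the lock owner to send on release, is a hypothetical helper):

        void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb)
        {
                struct sock *sk = peer->sock->sock->sk;

                bh_lock_sock(sk);
                if (sock_owned_by_user(sk)) {
                        /* a process holds the lock: defer to its release path */
                        ovpn_tcp_queue_out(sk, skb);    /* hypothetical */
                } else {
                        /* we own the socket: safe to set out_msg and push */
                        peer->tcp.out_msg.skb = skb;
                        peer->tcp.out_msg.len = skb->len;
                        peer->tcp.out_msg.offset = 0;
                        ovpn_tcp_send_sock(peer);
                }
                bh_unlock_sock(sk);
        }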
+        do {
+                int ret = skb_send_sock_locked(peer->sock->sock->sk, skb,
+                                               peer->tcp.out_msg.offset,
+                                               peer->tcp.out_msg.len);
+                if (unlikely(ret < 0)) {
+                        if (ret == -EAGAIN)
+                                goto out;
+
+                        net_warn_ratelimited("%s: TCP error to peer %u: %d\n",
+                                             peer->ovpn->dev->name, peer->id,
+                                             ret);
+
+                        /* in case of TCP error we can't recover the VPN
+                         * stream therefore we abort the connection
+                         */
+                        ovpn_peer_del(peer,
+                                      OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
+                        break;
+                }
+
+                peer->tcp.out_msg.len -= ret;
+                peer->tcp.out_msg.offset += ret;
+        } while (peer->tcp.out_msg.len > 0);
+
+        if (!peer->tcp.out_msg.len)
+                dev_sw_netstats_tx_add(peer->ovpn->dev, 1, skb->len);
+
+        kfree_skb(peer->tcp.out_msg.skb);
+        peer->tcp.out_msg.skb = NULL;
+        peer->tcp.out_msg.len = 0;
+        peer->tcp.out_msg.offset = 0;
+
+out:
+        peer->tcp.tx_in_progress = false;
+}
+static void ovpn_tcp_tx_work(struct work_struct *work)
+{
+        struct ovpn_peer *peer;
+
+        peer = container_of(work, struct ovpn_peer, tcp.tx_work);
+        lock_sock(peer->sock->sock->sk);
+        ovpn_tcp_send_sock(peer);
+        release_sock(peer->sock->sock->sk);
+}
+void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+        if (peer->tcp.out_msg.skb)
+                return;
That's leaking the skb? (and not counting the drop; a possible fix is sketched after this hunk)
+        peer->tcp.out_msg.skb = skb;
+        peer->tcp.out_msg.len = skb->len;
+        peer->tcp.out_msg.offset = 0;
+
+        ovpn_tcp_send_sock(peer);
+}
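For the early return above, something like this would avoid the leak and account the drop (sketch):

        if (peer->tcp.out_msg.skb) {
                /* a message is already being sent: drop the new one */
                dev_core_stats_tx_dropped_inc(peer->ovpn->dev);
                kfree_skb(skb);
                return;
        }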
+static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
[...]
+        ret = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+        if (ret) {
+                kfree_skb(skb);
+                net_err_ratelimited("%s: skb copy from iter failed: %d\n",
+                                    sock->peer->ovpn->dev->name, ret);
+                goto unlock;
+        }
+
+        ovpn_tcp_send_sock_skb(sock->peer, skb);
If we didn't send the packet (because one was already queued/in progress), we should either stash it, or tell userspace that it wasn't sent and it should retry later (see the sketch after this hunk).
+        ret = size;
+unlock:
+        release_sock(sk);
+peer_free:
+        ovpn_peer_put(peer);
+        return ret;
+}
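A sketch of the "tell userspace" variant (the errno choice is debatable; -EAGAIN would at least let a non-blocking writer retry):

        /* sketch: fail the write instead of silently dropping when a
         * message is already queued
         */
        if (sock->peer->tcp.out_msg.skb) {
                kfree_skb(skb);
                ret = -EAGAIN;
                goto unlock;
        }

        ovpn_tcp_send_sock_skb(sock->peer, skb);
        ret = size;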