December 2024 - Linux-stable-mirror

[PATCH net 4/5] gve: process XSK TX descriptors as part of RX NAPI

by Praveen Kaligineedi

From: Joshua Washington <joshwash(a)google.com> When busy polling is enabled, xsk_sendmsg for AF_XDP zero copy marks the NAPI ID corresponding to the memory pool allocated for the socket. In GVE, this NAPI ID will never correspond to a NAPI ID of one of the dedicated XDP TX queues registered with the umem because XDP TX is not set up to share a NAPI with a corresponding RX queue. This patch moves XSK TX descriptor processing from the TX NAPI to the RX NAPI, and the gve_xsk_wakeup callback is updated to use the RX NAPI instead of the TX NAPI, accordingly. The branch on if the wakeup is for TX is removed, as the NAPI poll should be invoked whether the wakeup is for TX or for RX. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable(a)vger.kernel.org Signed-off-by: Praveen Kaligineedi <pkaligineedi(a)google.com> Signed-off-by: Joshua Washington <joshwash(a)google.com> Reviewed-by: Willem de Bruijn <willemb(a)google.com> --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_main.c | 8 +++++ drivers/net/ethernet/google/gve/gve_tx.c | 36 +++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index dd92949bb214..8167cc5fb0df 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1140,6 +1140,7 @@ int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx, void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid); bool gve_tx_poll(struct gve_notify_block *block, int budget); bool gve_xdp_poll(struct gve_notify_block *block, int budget); +int gve_xsk_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings_gqi(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg); void gve_tx_free_rings_gqi(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e4e8ff4f9f80..5cab7b88610f 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -333,6 +333,14 @@ int gve_napi_poll(struct napi_struct *napi, int budget) if (block->rx) { work_done = gve_rx_poll(block, budget); + + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of + * TX and RX work done. + */ + if (priv->xdp_prog) + work_done = max_t(int, work_done, + gve_xsk_tx_poll(block, budget)); + reschedule |= work_done == budget; } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 852f8c7e39d2..4350ebd9c2bd 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -981,33 +981,41 @@ static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx, return sent; } +int gve_xsk_tx_poll(struct gve_notify_block *rx_block, int budget) +{ + struct gve_rx_ring *rx = rx_block->rx; + struct gve_priv *priv = rx->gve; + struct gve_tx_ring *tx; + int sent = 0; + + tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; + if (tx->xsk_pool) { + sent = gve_xsk_tx(priv, tx, budget); + + u64_stats_update_begin(&tx->statss); + tx->xdp_xsk_sent += sent; + u64_stats_update_end(&tx->statss); + if (xsk_uses_need_wakeup(tx->xsk_pool)) + xsk_set_tx_need_wakeup(tx->xsk_pool); + } + + return sent; +} + bool gve_xdp_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; u32 nic_done; - bool repoll; u32 to_do; /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); gve_clean_xdp_done(priv, tx, to_do); - repoll = nic_done != tx->done; - - if (tx->xsk_pool) { - int sent = gve_xsk_tx(priv, tx, budget); - - u64_stats_update_begin(&tx->statss); - tx->xdp_xsk_sent += sent; - u64_stats_update_end(&tx->statss); - repoll |= (sent == budget); - if (xsk_uses_need_wakeup(tx->xsk_pool)) - xsk_set_tx_need_wakeup(tx->xsk_pool); - } /* If we still have work we want to repoll */ - return repoll; + return nic_done != tx->done; } bool gve_tx_poll(struct gve_notify_block *block, int budget) -- 2.47.1.613.gc27f4b7a9f-goog

10 months, 3 weeks

1
0
0 0

[PATCH net 1/5] gve: clean XDP queues in gve_tx_stop_ring_gqi

by Praveen Kaligineedi

From: Joshua Washington <joshwash(a)google.com> When stopping XDP TX rings, the XDP clean function needs to be called to clean out the entire queue, similar to what happens in the normal TX queue case. Otherwise, the FIFO won't be cleared correctly, and xsk_tx_completed won't be reported. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Cc: stable(a)vger.kernel.org Signed-off-by: Joshua Washington <joshwash(a)google.com> Signed-off-by: Praveen Kaligineedi <pkaligineedi(a)google.com> Reviewed-by: Praveen Kaligineedi <pkaligineedi(a)google.com> Reviewed-by: Willem de Bruijn <willemb(a)google.com> --- drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index e7fb7d6d283d..83ad278ec91f 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx) return; gve_remove_napi(priv, ntfy_idx); - gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + if (tx->q_num < priv->tx_cfg.num_queues) + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + else + gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt); netdev_tx_reset_queue(tx->netdev_txq); gve_tx_remove_from_block(priv, idx); } -- 2.47.1.613.gc27f4b7a9f-goog

10 months, 3 weeks

1
0
0 0

[PATCH 5.4.y 1/2] erofs: fix order >= MAX_ORDER warning due to crafted negative i_size

by Gao Xiang

commit 1dd73601a1cba37a0ed5f89a8662c90191df5873 upstream. As syzbot reported [1], the root cause is that i_size field is a signed type, and negative i_size is also less than EROFS_BLKSIZ. As a consequence, it's handled as fast symlink unexpectedly. Let's fall back to the generic path to deal with such unusual i_size. [1] https://lore.kernel.org/r/000000000000ac8efa05e7feaa1f@google.com Reported-by: syzbot+f966c13b1b4fc0403b19(a)syzkaller.appspotmail.com Fixes: 431339ba9042 ("staging: erofs: add inode operations") Reviewed-by: Yue Hu <huyue2(a)coolpad.com> Link: https://lore.kernel.org/r/20220909023948.28925-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang <hsiangkao(a)linux.alibaba.com> --- fs/erofs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 0dbeaf68e1d6..ba981076d6f2 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -202,7 +202,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE) { + inode->i_size >= PAGE_SIZE || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } -- 2.43.5

10 months, 3 weeks

2
3
0 0

[PATCH 5.15.y] erofs: fix incorrect symlink detection in fast symlink

by Gao Xiang

commit 9ed50b8231e37b1ae863f5dec8153b98d9f389b4 upstream. Fast symlink can be used if the on-disk symlink data is stored in the same block as the on-disk inode, so we don’t need to trigger another I/O for symlink data. However, currently fs correction could be reported _incorrectly_ if inode xattrs are too large. In fact, these should be valid images although they cannot be handled as fast symlinks. Many thanks to Colin for reporting this! Reported-by: Colin Walters <walters(a)verbum.org> Reported-by: https://honggfuzz.dev/ Link: https://lore.kernel.org/r/bb2dd430-7de0-47da-ae5b-82ab2dd4d945@app.fastmail… Fixes: 431339ba9042 ("staging: erofs: add inode operations") [ Note that it's a runtime misbehavior instead of a security issue. ] Link: https://lore.kernel.org/r/20240909031911.1174718-1-hsiangkao@linux.alibaba.… Signed-off-by: Gao Xiang <hsiangkao(a)linux.alibaba.com> --- fs/erofs/inode.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 638bb70d0d65..c68258ae70d3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -219,11 +219,14 @@ static int erofs_fill_symlink(struct inode *inode, void *data, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); + loff_t off; char *lnk; - /* if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE || inode->i_size < 0) { + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -232,17 +235,6 @@ static int erofs_fill_symlink(struct inode *inode, void *data, if (!lnk) return -ENOMEM; - m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross page boundary as well */ - if (m_pofs + inode->i_size > PAGE_SIZE) { - kfree(lnk); - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - memcpy(lnk, data + m_pofs, inode->i_size); lnk[inode->i_size] = '\0'; -- 2.43.5

10 months, 3 weeks

2
1
0 0

[PATCH 5.10.y 1/2] erofs: fix order >= MAX_ORDER warning due to crafted negative i_size

by Gao Xiang

commit 1dd73601a1cba37a0ed5f89a8662c90191df5873 upstream. As syzbot reported [1], the root cause is that i_size field is a signed type, and negative i_size is also less than EROFS_BLKSIZ. As a consequence, it's handled as fast symlink unexpectedly. Let's fall back to the generic path to deal with such unusual i_size. [1] https://lore.kernel.org/r/000000000000ac8efa05e7feaa1f@google.com Reported-by: syzbot+f966c13b1b4fc0403b19(a)syzkaller.appspotmail.com Fixes: 431339ba9042 ("staging: erofs: add inode operations") Reviewed-by: Yue Hu <huyue2(a)coolpad.com> Link: https://lore.kernel.org/r/20220909023948.28925-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang <hsiangkao(a)linux.alibaba.com> --- fs/erofs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 0a94a52a119f..93a4ed665d93 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -202,7 +202,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= PAGE_SIZE) { + inode->i_size >= PAGE_SIZE || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } -- 2.43.5

10 months, 3 weeks

2
3
0 0

[PATCH v2 2/2] i2c: microchip-core: fix "ghost" detections

by Conor Dooley

From: Conor Dooley <conor.dooley(a)microchip.com> Running i2c-detect currently produces an output akin to: 0 1 2 3 4 5 6 7 8 9 a b c d e f 00: 08 -- 0a -- 0c -- 0e -- 10: 10 -- 12 -- 14 -- 16 -- UU 19 -- 1b -- 1d -- 1f 20: -- 21 -- 23 -- 25 -- 27 -- 29 -- 2b -- 2d -- 2f 30: -- -- -- -- -- -- -- -- 38 -- 3a -- 3c -- 3e -- 40: 40 -- 42 -- 44 -- 46 -- 48 -- 4a -- 4c -- 4e -- 50: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 60: 60 -- 62 -- 64 -- 66 -- 68 -- 6a -- 6c -- 6e -- 70: 70 -- 72 -- 74 -- 76 -- This happens because for an i2c_msg with a len of 0 the driver will mark the transmission of the message as a success once the START has been sent, without waiting for the devices on the bus to respond with an ACK/NAK. Since i2cdetect seems to run in a tight loop over all addresses the NAK is treated as part of the next test for the next address. Delete the fast path that marks a message as complete when idev->msg_len is zero after sending a START/RESTART since this isn't a valid scenario. CC: stable(a)vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com> --- My original tests with KASAN/UBSAN/PREEMPT_RT enabled saw far fewer of these "ghost" detections and the skip caused by the occupied address at 0x18 on this bus is part of my attribution of the problem. Unless I'm mistaken there's no scenario that you consider a message complete after sending a START/RESTART without waiting for the ACK/NAK and this code path I deleted is useless? Looking out of tree, it predates my involvement with the code so I don't know where it came from, nor is there anything like it in the bare-metal driver the linux one was based on. --- drivers/i2c/busses/i2c-microchip-corei2c.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index e5af38dfaa81..b0a51695138a 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -287,8 +287,6 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) ctrl &= ~CTRL_STA; writeb(idev->addr, idev->base + CORE_I2C_DATA); writeb(ctrl, idev->base + CORE_I2C_CTRL); - if (idev->msg_len == 0) - finished = true; break; case STATUS_M_ARB_LOST: idev->msg_err = -EAGAIN; -- 2.45.2

10 months, 3 weeks

1
0
0 0

[PATCH v2 1/2] i2c: microchip-core: actually use repeated sends

by Conor Dooley

From: Conor Dooley <conor.dooley(a)microchip.com> At present, where repeated sends are intended to be used, the i2c-microchip-core driver sends a stop followed by a start. Lots of i2c devices must not malfunction in the face of this behaviour, because the driver has operated like this for years! Try to keep track of whether or not a repeated send is required, and suppress sending a stop in these cases. CC: stable(a)vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com> --- drivers/i2c/busses/i2c-microchip-corei2c.c | 124 ++++++++++++++++----- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index 0b0a1c4d17ca..e5af38dfaa81 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -93,27 +93,35 @@ * @base: pointer to register struct * @dev: device reference * @i2c_clk: clock reference for i2c input clock + * @msg_queue: pointer to the messages requiring sending * @buf: pointer to msg buffer for easier use * @msg_complete: xfer completion object * @adapter: core i2c abstraction * @msg_err: error code for completed message * @bus_clk_rate: current i2c bus clock rate * @isr_status: cached copy of local ISR status + * @total_num: total number of messages to be sent/received + * @current_num: index of the current message being sent/received * @msg_len: number of bytes transferred in msg * @addr: address of the current slave + * @restart_needed: whether or not a repeated start is required after current message */ struct mchp_corei2c_dev { void __iomem *base; struct device *dev; struct clk *i2c_clk; + struct i2c_msg *msg_queue; u8 *buf; struct completion msg_complete; struct i2c_adapter adapter; int msg_err; + int total_num; + int current_num; u32 bus_clk_rate; u32 isr_status; u16 msg_len; u8 addr; + bool restart_needed; }; static void mchp_corei2c_core_disable(struct mchp_corei2c_dev *idev) @@ -222,6 +230,47 @@ static int mchp_corei2c_fill_tx(struct mchp_corei2c_dev *idev) return 0; } +static void mchp_corei2c_next_msg(struct mchp_corei2c_dev *idev) +{ + struct i2c_msg *this_msg; + u8 ctrl; + + if (idev->current_num >= idev->total_num) { + complete(&idev->msg_complete); + return; + } + + /* + * If there's been an error, the isr needs to return control + * to the "main" part of the driver, so as not to keep sending + * messages once it completes and clears the SI bit. + */ + if (idev->msg_err) { + complete(&idev->msg_complete); + return; + } + + this_msg = idev->msg_queue++; + + if (idev->current_num < (idev->total_num - 1)) { + struct i2c_msg *next_msg = idev->msg_queue; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } else { + idev->restart_needed = false; + } + + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + + ctrl = readb(idev->base + CORE_I2C_CTRL); + ctrl |= CTRL_STA; + writeb(ctrl, idev->base + CORE_I2C_CTRL); + + idev->current_num++; +} + static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) { u32 status = idev->isr_status; @@ -247,10 +296,14 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) break; case STATUS_M_SLAW_ACK: case STATUS_M_TX_DATA_ACK: - if (idev->msg_len > 0) + if (idev->msg_len > 0) { mchp_corei2c_fill_tx(idev); - else - last_byte = true; + } else { + if (idev->restart_needed) + finished = true; + else + last_byte = true; + } break; case STATUS_M_TX_DATA_NACK: case STATUS_M_SLAR_NACK: @@ -287,7 +340,7 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) mchp_corei2c_stop(idev); if (last_byte || finished) - complete(&idev->msg_complete); + mchp_corei2c_next_msg(idev); return IRQ_HANDLED; } @@ -311,21 +364,48 @@ static irqreturn_t mchp_corei2c_isr(int irq, void *_dev) return ret; } -static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, - struct i2c_msg *msg) +static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) { - u8 ctrl; + struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); + struct i2c_msg *this_msg = msgs; unsigned long time_left; - - idev->addr = i2c_8bit_addr_from_msg(msg); - idev->msg_len = msg->len; - idev->buf = msg->buf; - idev->msg_err = 0; - - reinit_completion(&idev->msg_complete); + u8 ctrl; mchp_corei2c_core_enable(idev); + /* + * The isr controls the flow of a transfer, this info needs to be saved + * to a location that it can access the queue information from. + */ + idev->restart_needed = false; + idev->msg_queue = msgs; + idev->total_num = num; + idev->current_num = 0; + + /* + * But the first entry to the isr is triggered by the start in this + * function, so the first message needs to be "dequeued". + */ + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + idev->msg_err = 0; + + if (idev->total_num > 1) { + struct i2c_msg *next_msg = msgs + 1; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } + + idev->current_num++; + idev->msg_queue++; + + reinit_completion(&idev->msg_complete); + + /* + * Send the first start to pass control to the isr + */ ctrl = readb(idev->base + CORE_I2C_CTRL); ctrl |= CTRL_STA; writeb(ctrl, idev->base + CORE_I2C_CTRL); @@ -335,20 +415,8 @@ static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, if (!time_left) return -ETIMEDOUT; - return idev->msg_err; -} - -static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, - int num) -{ - struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); - int i, ret; - - for (i = 0; i < num; i++) { - ret = mchp_corei2c_xfer_msg(idev, msgs++); - if (ret) - return ret; - } + if (idev->msg_err) + return idev->msg_err; return num; } -- 2.45.2

10 months, 3 weeks

1
0
0 0

[PATCH v2] virtio: fix reference leak in register_virtio_device()

by Ma Ke

Once device_add(&dev->dev) failed, call put_device() to explicitly release dev->dev. Or it could cause double free problem. As comment of device_add() says, 'if device_add() succeeds, you should call device_del() when you want to get rid of it. If device_add() has not succeeded, use only put_device() to drop the reference count'. Found by code review. Cc: stable(a)vger.kernel.org Fixes: f2b44cde7e16 ("virtio: split device_register into device_initialize and device_add") Signed-off-by: Ma Ke <make_ruc2021(a)163.com> --- Changes in v2: - modified the bug description to make it more clear; - changed the Fixes tag. --- drivers/virtio/virtio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index b9095751e43b..ac721b5597e8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -503,6 +503,7 @@ int register_virtio_device(struct virtio_device *dev) out_of_node_put: of_node_put(dev->dev.of_node); + put_device(&dev->dev); out_ida_remove: ida_free(&virtio_index_ida, dev->index); out: -- 2.25.1

10 months, 3 weeks

3
2
0 0

[PATCH] objtool: add bch2_trans_unlocked_error to bcachefs noreturns.

by chenchangcheng

fs/bcachefs/btree_trans_commit.o: warning: objtool: bch2_trans_commit_write_locked.isra.0() falls through to next function do_bch2_trans_commit.isra.0() fs/bcachefs/btree_trans_commit.o: warning: objtool: .text: unexpected end of section ...... fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function flush_new_cached_update() fs/bcachefs/btree_update.o: warning: objtool: flush_new_cached_update() falls through to next function bch2_trans_update_by_path() Signed-off-by: chenchangcheng <ccc194101(a)163.com> --- tools/objtool/noreturns.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index f37614cc2c1b..88a0fa8807be 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -49,3 +49,4 @@ NORETURN(x86_64_start_kernel) NORETURN(x86_64_start_reservations) NORETURN(xen_cpu_bringup_again) NORETURN(xen_start_kernel) +NORETURN(bch2_trans_unlocked_error) -- 2.25.1

10 months, 3 weeks

1
0
0 0

[PATCH V7] mm, compaction: don't use ALLOC_CMA for unmovable allocations

by yangge1116＠126.com

From: yangge <yangge1116(a)126.com> Since commit 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()") allow compaction to proceed when free pages required for compaction reside in the CMA pageblocks, it's possible that __compaction_suitable() always returns true, and in some cases, it's not acceptable. There are 4 NUMA nodes on my machine, and each NUMA node has 32GB of memory. I have configured 16GB of CMA memory on each NUMA node, and starting a 32GB virtual machine with device passthrough is extremely slow, taking almost an hour. During the start-up of the virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to allocate memory. Long term GUP cannot allocate memory from CMA area, so a maximum of 16 GB of no-CMA memory on a NUMA node can be used as virtual machine memory. Since there is 16G of free CMA memory on the NUMA node, watermark for order-0 always be met for compaction, so __compaction_suitable() always returns true, even if the node is unable to allocate non-CMA memory for the virtual machine. For costly allocations, because __compaction_suitable() always returns true, __alloc_pages_slowpath() can't exit at the appropriate place, resulting in excessively long virtual machine startup times. Call trace: __alloc_pages_slowpath if (compact_result == COMPACT_SKIPPED || compact_result == COMPACT_DEFERRED) goto nopage; // should exit __alloc_pages_slowpath() from here Other unmovable alloctions, like dma_buf, which can be large in a Linux system, are also unable to allocate memory from CMA, and these allocations suffer from the same problems described above. In order to quickly fall back to remote node, we should remove ALLOC_CMA both in __compaction_suitable() and __isolate_free_page() for unmovable alloctions. After this fix, starting a 32GB virtual machine with device passthrough takes only a few seconds. Fixes: 984fdba6a32e ("mm, compaction: use proper alloc_flags in __compaction_suitable()") Cc: <stable(a)vger.kernel.org> Signed-off-by: yangge <yangge1116(a)126.com> Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com> --- V7: -- fix the changelog and code documentation V6: -- update cc->alloc_flags to keep the original loginc V5: - add 'alloc_flags' parameter for __isolate_free_page() - remove 'usa_cma' variable V4: - rich the commit log description V3: - fix build errors - add ALLOC_CMA both in should_continue_reclaim() and compaction_ready() V2: - using the 'cc->alloc_flags' to determin if 'ALLOC_CMA' is needed - rich the commit log description include/linux/compaction.h | 6 ++++-- mm/compaction.c | 26 +++++++++++++++----------- mm/internal.h | 3 ++- mm/page_alloc.c | 7 +++++-- mm/page_isolation.c | 3 ++- mm/page_reporting.c | 2 +- mm/vmscan.c | 4 ++-- 7 files changed, 31 insertions(+), 20 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e947764..b4c3ac3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -90,7 +90,8 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx); + int highest_zoneidx, + unsigned int alloc_flags); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -108,7 +109,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) + int highest_zoneidx, + unsigned int alloc_flags) { return false; } diff --git a/mm/compaction.c b/mm/compaction.c index 07bd227..223f2da 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -655,7 +655,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, will break it into order-0 pages */ order = buddy_order(page); - isolated = __isolate_free_page(page, order); + isolated = __isolate_free_page(page, order, cc->alloc_flags); if (!isolated) break; set_page_private(page, order); @@ -1634,7 +1634,7 @@ static void fast_isolate_freepages(struct compact_control *cc) /* Isolate the page if available */ if (page) { - if (__isolate_free_page(page, order)) { + if (__isolate_free_page(page, order, cc->alloc_flags)) { set_page_private(page, order); nr_isolated = 1 << order; nr_scanned += nr_isolated - 1; @@ -2381,6 +2381,7 @@ static enum compact_result compact_finished(struct compact_control *cc) static bool __compaction_suitable(struct zone *zone, int order, int highest_zoneidx, + unsigned int alloc_flags, unsigned long wmark_target) { unsigned long watermark; @@ -2395,25 +2396,26 @@ static bool __compaction_suitable(struct zone *zone, int order, * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets + * In addition to unmovable allocations, ALLOC_CMA is used, as pages in + * CMA pageblocks are considered suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, - ALLOC_CMA, wmark_target); + alloc_flags & ALLOC_CMA, wmark_target); } /* * compaction_suitable: Is this suitable to run compaction on this zone now? */ -bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) +bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx, + unsigned int alloc_flags) { enum compact_result compact_result; bool suitable; - suitable = __compaction_suitable(zone, order, highest_zoneidx, + suitable = __compaction_suitable(zone, order, highest_zoneidx, alloc_flags, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2474,7 +2476,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); if (__compaction_suitable(zone, order, ac->highest_zoneidx, - available)) + alloc_flags, available)) return true; } @@ -2499,7 +2501,7 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order, alloc_flags)) return COMPACT_SUCCESS; - if (!compaction_suitable(zone, order, highest_zoneidx)) + if (!compaction_suitable(zone, order, highest_zoneidx, alloc_flags)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; @@ -2893,6 +2895,7 @@ static int compact_node(pg_data_t *pgdat, bool proactive) struct compact_control cc = { .order = -1, .mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC, + .alloc_flags = ALLOC_CMA, .ignore_skip_hint = true, .whole_zone = true, .gfp_mask = GFP_KERNEL, @@ -3037,7 +3040,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) ret = compaction_suit_allocation_order(zone, pgdat->kcompactd_max_order, - highest_zoneidx, ALLOC_WMARK_MIN); + highest_zoneidx, ALLOC_CMA | ALLOC_WMARK_MIN); if (ret == COMPACT_CONTINUE) return true; } @@ -3058,6 +3061,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) .search_order = pgdat->kcompactd_max_order, .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, + .alloc_flags = ALLOC_CMA | ALLOC_WMARK_MIN, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; @@ -3078,7 +3082,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) continue; ret = compaction_suit_allocation_order(zone, - cc.order, zoneid, ALLOC_WMARK_MIN); + cc.order, zoneid, cc.alloc_flags); if (ret != COMPACT_CONTINUE) continue; diff --git a/mm/internal.h b/mm/internal.h index 3922788..6d257c8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -662,7 +662,8 @@ static inline void clear_zone_contiguous(struct zone *zone) zone->contiguous = false; } -extern int __isolate_free_page(struct page *page, unsigned int order); +extern int __isolate_free_page(struct page *page, unsigned int order, + unsigned int alloc_flags); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dde19db..1bfdca3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2809,7 +2809,8 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order, + unsigned int alloc_flags) { struct zone *zone = page_zone(page); int mt = get_pageblock_migratetype(page); @@ -2823,7 +2824,8 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = zone->_watermark[WMARK_MIN] + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, + alloc_flags & ALLOC_CMA)) return 0; } @@ -6454,6 +6456,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .order = -1, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, + .alloc_flags = ALLOC_CMA, .ignore_skip_hint = true, .no_set_skip_hint = true, .alloc_contig = true, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c608e9d..a1f2c79 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -229,7 +229,8 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) buddy = find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { - isolated_page = !!__isolate_free_page(page, order); + isolated_page = !!__isolate_free_page(page, order, + ALLOC_CMA); /* * Isolating a free page in an isolated pageblock * is expected to always work as watermarks don't diff --git a/mm/page_reporting.c b/mm/page_reporting.c index e4c428e..fd3813b 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -198,7 +198,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, /* Attempt to pull page from list and place in scatterlist */ if (*offset) { - if (!__isolate_free_page(page, order)) { + if (!__isolate_free_page(page, order, ALLOC_CMA)) { next = page; break; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e03a61..33f5b46 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5815,7 +5815,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, sc->reclaim_idx, 0)) return false; - if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) + if (compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA)) return false; } @@ -6043,7 +6043,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) return true; /* Compaction cannot yet proceed. Do reclaim. */ - if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) + if (!compaction_suitable(zone, sc->order, sc->reclaim_idx, ALLOC_CMA)) return false; /* -- 2.7.4

10 months, 3 weeks

4
6
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror December 2024