The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 14 Aug 2018 15:33:02 -0700
Subject: [PATCH] IB/rxe: Revise the ib_wr_opcode enum
This enum has become part of the uABI, as both RXE and the
ib_uverbs_post_send() command expect userspace to supply values from this
enum. So it should be properly placed in include/uapi/rdma.
In userspace this enum is called 'enum ibv_wr_opcode' as part of
libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV,
IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly, it
turns out) into libiberbs in 2015.
The kernel has changed its mind on the numbering for several of the IB_WC
values over the years, but has remained stable on IB_WR_LOCAL_INV and
below.
Based on this we can conclude that there is no real user space user of the
values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked via
rdma-core. This is confirmed by inspection, only rxe uses the kernel enum
and implements the latter operations. rxe has clearly never worked with
these attributes from userspace. Other drivers that support these opcodes
implement the functionality without calling out to the kernel.
To make IB_WR_SEND_WITH_INV and related work for RXE in userspace we
choose to renumber the IB_WR enum in the kernel to match the uABI that
userspace has bee using since before Soft RoCE was merged. This is an
overall simpler configuration for the whole software stack, and obviously
can't break anything existing.
Reported-by: Seth Howell <seth.howell(a)intel.com>
Tested-by: Seth Howell <seth.howell(a)intel.com>
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6076c9b72ab9..e463d3007a35 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1281,21 +1281,27 @@ struct ib_qp_attr {
};
enum ib_wr_opcode {
- IB_WR_RDMA_WRITE,
- IB_WR_RDMA_WRITE_WITH_IMM,
- IB_WR_SEND,
- IB_WR_SEND_WITH_IMM,
- IB_WR_RDMA_READ,
- IB_WR_ATOMIC_CMP_AND_SWP,
- IB_WR_ATOMIC_FETCH_AND_ADD,
- IB_WR_LSO,
- IB_WR_SEND_WITH_INV,
- IB_WR_RDMA_READ_WITH_INV,
- IB_WR_LOCAL_INV,
- IB_WR_REG_MR,
- IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
- IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ /* These are shared with userspace */
+ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND = IB_UVERBS_WR_SEND,
+ IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_LSO = IB_UVERBS_WR_TSO,
+ IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+
+ /* These are kernel only and can not be issued by userspace */
+ IB_WR_REG_MR = 0x20,
IB_WR_REG_SIG_MR,
+
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
*/
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 25a16760de2a..1254b51a551a 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -763,10 +763,28 @@ struct ib_uverbs_sge {
__u32 lkey;
};
+enum ib_uverbs_wr_opcode {
+ IB_UVERBS_WR_RDMA_WRITE = 0,
+ IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1,
+ IB_UVERBS_WR_SEND = 2,
+ IB_UVERBS_WR_SEND_WITH_IMM = 3,
+ IB_UVERBS_WR_RDMA_READ = 4,
+ IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5,
+ IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6,
+ IB_UVERBS_WR_LOCAL_INV = 7,
+ IB_UVERBS_WR_BIND_MW = 8,
+ IB_UVERBS_WR_SEND_WITH_INV = 9,
+ IB_UVERBS_WR_TSO = 10,
+ IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+ /* Review enum ib_wr_opcode before modifying this */
+};
+
struct ib_uverbs_send_wr {
__aligned_u64 wr_id;
__u32 num_sge;
- __u32 opcode;
+ __u32 opcode; /* see enum ib_uverbs_wr_opcode */
__u32 send_flags;
union {
__be32 imm_data;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 14 Aug 2018 15:33:02 -0700
Subject: [PATCH] IB/rxe: Revise the ib_wr_opcode enum
This enum has become part of the uABI, as both RXE and the
ib_uverbs_post_send() command expect userspace to supply values from this
enum. So it should be properly placed in include/uapi/rdma.
In userspace this enum is called 'enum ibv_wr_opcode' as part of
libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV,
IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly, it
turns out) into libiberbs in 2015.
The kernel has changed its mind on the numbering for several of the IB_WC
values over the years, but has remained stable on IB_WR_LOCAL_INV and
below.
Based on this we can conclude that there is no real user space user of the
values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked via
rdma-core. This is confirmed by inspection, only rxe uses the kernel enum
and implements the latter operations. rxe has clearly never worked with
these attributes from userspace. Other drivers that support these opcodes
implement the functionality without calling out to the kernel.
To make IB_WR_SEND_WITH_INV and related work for RXE in userspace we
choose to renumber the IB_WR enum in the kernel to match the uABI that
userspace has bee using since before Soft RoCE was merged. This is an
overall simpler configuration for the whole software stack, and obviously
can't break anything existing.
Reported-by: Seth Howell <seth.howell(a)intel.com>
Tested-by: Seth Howell <seth.howell(a)intel.com>
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6076c9b72ab9..e463d3007a35 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1281,21 +1281,27 @@ struct ib_qp_attr {
};
enum ib_wr_opcode {
- IB_WR_RDMA_WRITE,
- IB_WR_RDMA_WRITE_WITH_IMM,
- IB_WR_SEND,
- IB_WR_SEND_WITH_IMM,
- IB_WR_RDMA_READ,
- IB_WR_ATOMIC_CMP_AND_SWP,
- IB_WR_ATOMIC_FETCH_AND_ADD,
- IB_WR_LSO,
- IB_WR_SEND_WITH_INV,
- IB_WR_RDMA_READ_WITH_INV,
- IB_WR_LOCAL_INV,
- IB_WR_REG_MR,
- IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
- IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ /* These are shared with userspace */
+ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND = IB_UVERBS_WR_SEND,
+ IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_LSO = IB_UVERBS_WR_TSO,
+ IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+
+ /* These are kernel only and can not be issued by userspace */
+ IB_WR_REG_MR = 0x20,
IB_WR_REG_SIG_MR,
+
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
*/
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 25a16760de2a..1254b51a551a 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -763,10 +763,28 @@ struct ib_uverbs_sge {
__u32 lkey;
};
+enum ib_uverbs_wr_opcode {
+ IB_UVERBS_WR_RDMA_WRITE = 0,
+ IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1,
+ IB_UVERBS_WR_SEND = 2,
+ IB_UVERBS_WR_SEND_WITH_IMM = 3,
+ IB_UVERBS_WR_RDMA_READ = 4,
+ IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5,
+ IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6,
+ IB_UVERBS_WR_LOCAL_INV = 7,
+ IB_UVERBS_WR_BIND_MW = 8,
+ IB_UVERBS_WR_SEND_WITH_INV = 9,
+ IB_UVERBS_WR_TSO = 10,
+ IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+ /* Review enum ib_wr_opcode before modifying this */
+};
+
struct ib_uverbs_send_wr {
__aligned_u64 wr_id;
__u32 num_sge;
- __u32 opcode;
+ __u32 opcode; /* see enum ib_uverbs_wr_opcode */
__u32 send_flags;
union {
__be32 imm_data;
This is backport of the upstream patch
41c73a49df31151f4ff868f28fe4f129f113fa2c. It should be backported to
stable kernels because this performance problem was seen on the android
4.4 kernel.
commit 41c73a49df31151f4ff868f28fe4f129f113fa2c
Author: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Wed Nov 23 17:04:00 2016 -0500
dm bufio: drop the lock when doing GFP_NOIO allocation
If the first allocation attempt using GFP_NOWAIT fails, drop the lock
and retry using GFP_NOIO allocation (lock is dropped because the
allocation can take some time).
Note that we won't do GFP_NOIO allocation when we loop for the second
time, because the lock shouldn't be dropped between __wait_for_free_buffer
and __get_unclaimed_buffer.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 9ef88f0e2382..1c2e1dd7ca16 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -822,6 +822,7 @@ enum new_flag {
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
struct dm_buffer *b;
+ bool tried_noio_alloc = false;
/*
* dm-bufio is resistant to allocation failures (it just keeps
@@ -846,6 +847,15 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
if (nf == NF_PREFETCH)
return NULL;
+ if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
+ dm_bufio_unlock(c);
+ b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ dm_bufio_lock(c);
+ if (b)
+ return b;
+ tried_noio_alloc = true;
+ }
+
if (!list_empty(&c->reserved_buffers)) {
b = list_entry(c->reserved_buffers.next,
struct dm_buffer, lru_list);
This is backport of the upstream patch
9ea61cac0b1ad0c09022f39fd97e9b99a2cfc2dc. It should be backported to
stable kernels because this performance problem was seen on the android
4.4 kernel.
commit 9ea61cac0b1ad0c09022f39fd97e9b99a2cfc2dc
Author: Douglas Anderson <dianders(a)chromium.org>
Date: Thu Nov 17 11:24:20 2016 -0800
dm bufio: avoid sleeping while holding the dm_bufio lock
We've seen in-field reports showing _lots_ (18 in one case, 41 in
another) of tasks all sitting there blocked on:
mutex_lock+0x4c/0x68
dm_bufio_shrink_count+0x38/0x78
shrink_slab.part.54.constprop.65+0x100/0x464
shrink_zone+0xa8/0x198
In the two cases analyzed, we see one task that looks like this:
Workqueue: kverityd verity_prefetch_io
__switch_to+0x9c/0xa8
__schedule+0x440/0x6d8
schedule+0x94/0xb4
schedule_timeout+0x204/0x27c
schedule_timeout_uninterruptible+0x44/0x50
wait_iff_congested+0x9c/0x1f0
shrink_inactive_list+0x3a0/0x4cc
shrink_lruvec+0x418/0x5cc
shrink_zone+0x88/0x198
try_to_free_pages+0x51c/0x588
__alloc_pages_nodemask+0x648/0xa88
__get_free_pages+0x34/0x7c
alloc_buffer+0xa4/0x144
__bufio_new+0x84/0x278
dm_bufio_prefetch+0x9c/0x154
verity_prefetch_io+0xe8/0x10c
process_one_work+0x240/0x424
worker_thread+0x2fc/0x424
kthread+0x10c/0x114
...and that looks to be the one holding the mutex.
The problem has been reproduced on fairly easily:
0. Be running Chrome OS w/ verity enabled on the root filesystem
1. Pick test patch: http://crosreview.com/412360
2. Install launchBalloons.sh and balloon.arm from
http://crbug.com/468342
...that's just a memory stress test app.
3. On a 4GB rk3399 machine, run
nice ./launchBalloons.sh 4 900 100000
...that tries to eat 4 * 900 MB of memory and keep accessing.
4. Login to the Chrome web browser and restore many tabs
With that, I've seen printouts like:
DOUG: long bufio 90758 ms
...and stack trace always show's we're in dm_bufio_prefetch().
The problem is that we try to allocate memory with GFP_NOIO while
we're holding the dm_bufio lock. Instead we should be using
GFP_NOWAIT. Using GFP_NOIO can cause us to sleep while holding the
lock and that causes the above problems.
The current behavior explained by David Rientjes:
It will still try reclaim initially because __GFP_WAIT (or
__GFP_KSWAPD_RECLAIM) is set by GFP_NOIO. This is the cause of
contention on dm_bufio_lock() that the thread holds. You want to
pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
mutex that can be contended by a concurrent slab shrinker (if
count_objects didn't use a trylock, this pattern would trivially
deadlock).
This change significantly increases responsiveness of the system while
in this state. It makes a real difference because it unblocks kswapd.
In the bug report analyzed, kswapd was hung:
kswapd0 D ffffffc000204fd8 0 72 2 0x00000000
Call trace:
[<ffffffc000204fd8>] __switch_to+0x9c/0xa8
[<ffffffc00090b794>] __schedule+0x440/0x6d8
[<ffffffc00090bac0>] schedule+0x94/0xb4
[<ffffffc00090be44>] schedule_preempt_disabled+0x28/0x44
[<ffffffc00090d900>] __mutex_lock_slowpath+0x120/0x1ac
[<ffffffc00090d9d8>] mutex_lock+0x4c/0x68
[<ffffffc000708e7c>] dm_bufio_shrink_count+0x38/0x78
[<ffffffc00030b268>] shrink_slab.part.54.constprop.65+0x100/0x464
[<ffffffc00030dbd8>] shrink_zone+0xa8/0x198
[<ffffffc00030e578>] balance_pgdat+0x328/0x508
[<ffffffc00030eb7c>] kswapd+0x424/0x51c
[<ffffffc00023f06c>] kthread+0x10c/0x114
[<ffffffc000203dd0>] ret_from_fork+0x10/0x40
By unblocking kswapd memory pressure should be reduced.
Suggested-by: David Rientjes <rientjes(a)google.com>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 125aedc3875f..b5d3428d109e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
* dm-bufio is resistant to allocation failures (it just keeps
* one buffer reserved in cases all the allocations fail).
* So set flags to not try too hard:
- * GFP_NOIO: don't recurse into the I/O layer
+ * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
+ * mutex and wait ourselves.
* __GFP_NORETRY: don't retry and rather return failure
* __GFP_NOMEMALLOC: don't use emergency reserves
* __GFP_NOWARN: don't print a warning in case of failure
@@ -837,7 +838,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
*/
while (1) {
if (dm_bufio_cache_size_latch != 1) {
- b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (b)
return b;
}