From: Andy Lutomirski <luto(a)kernel.org>
Trying to clear DR7 around a #DB from usermode malfunctions if we
schedule when delivering SIGTRAP. Rather than trying to define a
special no-recursion region, just allow a single level of recursion.
We do the same thing for NMI, and it hasn't caused any problems yet.
Fixes: 9f58fdde95c9 ("x86/db: Split out dr6/7 handling")
Reported-by: Kyle Huey <me(a)kylehuey.com>
Debugged-by: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/8b9bd05f187231df008d48cf818a6a311cbd5c98.15978823…
---
arch/x86/kernel/traps.c | 65 ++++++++++++++++++++++--------------------------
1 file changed, 31 insertions(+), 34 deletions(-)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struc
#endif
}
-static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
+static __always_inline unsigned long debug_read_clear_dr6(void)
{
- /*
- * Disable breakpoints during exception handling; recursive exceptions
- * are exceedingly 'fun'.
- *
- * Since this function is NOKPROBE, and that also applies to
- * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
- * HW_BREAKPOINT_W on our stack)
- *
- * Entry text is excluded for HW_BP_X and cpu_entry_area, which
- * includes the entry stack is excluded for everything.
- */
- *dr7 = local_db_save();
+ unsigned long dr6;
/*
* The Intel SDM says:
@@ -755,15 +744,12 @@ static __always_inline void debug_enter(
*
* Keep it simple: clear DR6 immediately.
*/
- get_debugreg(*dr6, 6);
+ get_debugreg(dr6, 6);
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
- *dr6 &= ~DR6_RESERVED;
-}
+ dr6 &= ~DR6_RESERVED;
-static __always_inline void debug_exit(unsigned long dr7)
-{
- local_db_restore(dr7);
+ return dr6;
}
/*
@@ -863,6 +849,18 @@ static void handle_debug(struct pt_regs
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
+ /*
+ * Disable breakpoints during exception handling; recursive exceptions
+ * are exceedingly 'fun'.
+ *
+ * Since this function is NOKPROBE, and that also applies to
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+ * HW_BREAKPOINT_W on our stack)
+ *
+ * Entry text is excluded for HW_BP_X and cpu_entry_area, which
+ * includes the entry stack is excluded for everything.
+ */
+ unsigned long dr7 = local_db_save();
bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
@@ -883,6 +881,8 @@ static __always_inline void exc_debug_ke
instrumentation_end();
idtentry_exit_nmi(regs, irq_state);
+
+ local_db_restore(dr7);
}
static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -894,6 +894,15 @@ static __always_inline void exc_debug_us
*/
WARN_ON_ONCE(!user_mode(regs));
+ /*
+ * NB: We can't easily clear DR7 here because
+ * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * user memory, etc. This means that a recursive #DB is possible. If
+ * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+ * Since we're not on the IST stack right now, everything will be
+ * fine.
+ */
+
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
@@ -907,36 +916,24 @@ static __always_inline void exc_debug_us
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
- unsigned long dr6, dr7;
-
- debug_enter(&dr6, &dr7);
- exc_debug_kernel(regs, dr6);
- debug_exit(dr7);
+ exc_debug_kernel(regs, debug_read_clear_dr6());
}
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
- unsigned long dr6, dr7;
-
- debug_enter(&dr6, &dr7);
- exc_debug_user(regs, dr6);
- debug_exit(dr7);
+ exc_debug_user(regs, debug_read_clear_dr6());
}
#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
{
- unsigned long dr6, dr7;
-
- debug_enter(&dr6, &dr7);
+ unsigned long dr6 = debug_read_clear_dr6();
if (user_mode(regs))
exc_debug_user(regs, dr6);
else
exc_debug_kernel(regs, dr6);
-
- debug_exit(dr7);
}
#endif
From: Kai-Heng Feng <kai.heng.feng(a)canonical.com>
Sometimes re-plugging a USB device during system sleep renders the device
useless:
[ 173.418345] xhci_hcd 0000:00:14.0: Get port status 2-4 read: 0x14203e2, return 0x10262
...
[ 176.496485] usb 2-4: Waited 2000ms for CONNECT
[ 176.496781] usb usb2-port4: status 0000.0262 after resume, -19
[ 176.497103] usb 2-4: can't resume, status -19
[ 176.497438] usb usb2-port4: logical disconnect
Because PLS equals to XDEV_RESUME, xHCI driver reports U3 to usbcore,
despite of CAS bit is flagged.
So proritize CAS over XDEV_RESUME to let usbcore handle warm-reset for
the port.
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Kai-Heng Feng <kai.heng.feng(a)canonical.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-hub.c | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index c3554e37e09f..4e14e164cb68 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -740,15 +740,6 @@ static void xhci_hub_report_usb3_link_state(struct xhci_hcd *xhci,
{
u32 pls = status_reg & PORT_PLS_MASK;
- /* resume state is a xHCI internal state.
- * Do not report it to usb core, instead, pretend to be U3,
- * thus usb core knows it's not ready for transfer
- */
- if (pls == XDEV_RESUME) {
- *status |= USB_SS_PORT_LS_U3;
- return;
- }
-
/* When the CAS bit is set then warm reset
* should be performed on port
*/
@@ -770,6 +761,16 @@ static void xhci_hub_report_usb3_link_state(struct xhci_hcd *xhci,
*/
pls |= USB_PORT_STAT_CONNECTION;
} else {
+ /*
+ * Resume state is an xHCI internal state. Do not report it to
+ * usb core, instead, pretend to be U3, thus usb core knows
+ * it's not ready for transfer.
+ */
+ if (pls == XDEV_RESUME) {
+ *status |= USB_SS_PORT_LS_U3;
+ return;
+ }
+
/*
* If CAS bit isn't set but the Port is already at
* Compliance Mode, fake a connection so the USB core
--
2.17.1
From: Marcos Paulo de Souza <mpdesouza(a)suse.com>
[BUG]
After commit 9afc66498a0b ("btrfs: block-group: refactor how we read one
block group item"), cache->length is being assigned after calling
btrfs_create_block_group_cache. This causes a problem since
set_free_space_tree_thresholds is calculate the free-space threshould to
decide is the free-space tree should convert from extents to bitmaps.
The current code calls set_free_space_tree_thresholds with cache->length
being 0, which then makes cache->bitmap_high_thresh being zero. This
implies the system will always use bitmap instead of extents, which is
not desired if the block group is not fragmented.
This behavior can be seen by a test that expects to repair systems
with FREE_SPACE_EXTENT and FREE_SPACE_BITMAP, but the current code only
created FREE_SPACE_BITMAP.
[FIX]
Call set_free_space_tree_thresholds after setting cache->length.
Link: https://github.com/kdave/btrfs-progs/issues/251
Fixes: 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item")
CC: stable(a)vger.kernel.org # 5.8+
Signed-off-by: Marcos Paulo de Souza <mpdesouza(a)suse.com>
---
fs/btrfs/block-group.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 44fdfa2eeb2e..01e8ba1da1d3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1798,7 +1798,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
@@ -1908,6 +1907,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
read_block_group_item(cache, path, key);
+ set_free_space_tree_thresholds(cache);
+
if (need_clear) {
/*
* When we mount with old space cache, we need to
@@ -2128,6 +2129,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return -ENOMEM;
cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
--
2.28.0
SWP_FS is used to make swap_{read,write}page() go through
the filesystem, and it's only used for swap files over
NFS. So, !SWP_FS means non NFS for now, it could be either
file backed or device backed. Something similar goes with
legacy SWP_FILE.
So in order to achieve the goal of the original patch,
SWP_BLKDEV should be used instead.
FS corruption can be observed with SSD device + XFS +
fragmented swapfile due to CONFIG_THP_SWAP=y.
I reproduced the issue with the following details:
Environment:
QEMU + upstream kernel + buildroot + NVMe (2 GB)
Kernel config:
CONFIG_BLK_DEV_NVME=y
CONFIG_THP_SWAP=y
Some reproducable steps:
mkfs.xfs -f /dev/nvme0n1
mkdir /tmp/mnt
mount /dev/nvme0n1 /tmp/mnt
bs="32k"
sz="1024m" # doesn't matter too much, I also tried 16m
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -F -S 0 -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fsync" /tmp/mnt/sw
mkswap /tmp/mnt/sw
swapon /tmp/mnt/sw
stress --vm 2 --vm-bytes 600M # doesn't matter too much as well
Symptoms:
- FS corruption (e.g. checksum failure)
- memory corruption at: 0xd2808010
- segfault
Fixes: f0eea189e8e9 ("mm, THP, swap: Don't allocate huge cluster for file backed swap device")
Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out")
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yang Shi <yang.shi(a)linux.alibaba.com>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Dave Chinner <david(a)fromorbit.com>
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
---
v1: https://lore.kernel.org/r/20200819195613.24269-1-hsiangkao@redhat.com
changes since v1:
- improve commit message description
Hi Andrew,
Kindly consider this one instead if no other concerns...
Thanks,
Gao Xiang
mm/swapfile.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c26916e95fd..2937daf3ca02 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1074,7 +1074,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
--
2.18.1
From: Charan Teja Reddy <charante(a)codeaurora.org>
Subject: mm, page_alloc: fix core hung in free_pcppages_bulk()
The following race is observed with the repeated online, offline and a
delay between two successive online of memory blocks of movable zone.
P1 P2
Online the first memory block in
the movable zone. The pcp struct
values are initialized to default
values,i.e., pcp->high = 0 &
pcp->batch = 1.
Allocate the pages from the
movable zone.
Try to Online the second memory
block in the movable zone thus it
entered the online_pages() but yet
to call zone_pcp_update().
This process is entered into
the exit path thus it tries
to release the order-0 pages
to pcp lists through
free_unref_page_commit().
As pcp->high = 0, pcp->count = 1
proceed to call the function
free_pcppages_bulk().
Update the pcp values thus the
new pcp values are like, say,
pcp->high = 378, pcp->batch = 63.
Read the pcp's batch value using
READ_ONCE() and pass the same to
free_pcppages_bulk(), pcp values
passed here are, batch = 63,
count = 1.
Since num of pages in the pcp
lists are less than ->batch,
then it will stuck in
while(list_empty(list)) loop
with interrupts disabled thus
a core hung.
Avoid this by ensuring free_pcppages_bulk() is called with proper count of
pcp list pages.
The mentioned race is some what easily reproducible without [1] because
pcp's are not updated for the first memory block online and thus there is
a enough race window for P2 between alloc+free and pcp struct values
update through onlining of second memory block.
With [1], the race still exists but it is very narrow as we update the pcp
struct values for the first memory block online itself.
This is not limited to the movable zone, it could also happen in cases
with the normal zone (e.g., hotplug to a node that only has DMA memory, or
no other memory yet).
[1]: https://patchwork.kernel.org/patch/11696389/
Link: http://lkml.kernel.org/r/1597150703-19003-1-git-send-email-charante@codeaur…
Fixes: 5f8dcc21211a ("page-allocator: split per-cpu list into one-list-per-migrate-type")
Signed-off-by: Charan Teja Reddy <charante(a)codeaurora.org>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Rientjes <rientjes(a)google.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Vinayak Menon <vinmenon(a)codeaurora.org>
Cc: <stable(a)vger.kernel.org> [2.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/mm/page_alloc.c~mm-page_alloc-fix-core-hung-in-free_pcppages_bulk
+++ a/mm/page_alloc.c
@@ -1302,6 +1302,11 @@ static void free_pcppages_bulk(struct zo
struct page *page, *tmp;
LIST_HEAD(head);
+ /*
+ * Ensure proper count is passed which otherwise would stuck in the
+ * below while (list_empty(list)) loop.
+ */
+ count = min(pcp->count, count);
while (count) {
struct list_head *list;
_
From: Doug Berger <opendmb(a)gmail.com>
Subject: mm: include CMA pages in lowmem_reserve at boot
The lowmem_reserve arrays provide a means of applying pressure against
allocations from lower zones that were targeted at higher zones. Its
values are a function of the number of pages managed by higher zones and
are assigned by a call to the setup_per_zone_lowmem_reserve() function.
The function is initially called at boot time by the function
init_per_zone_wmark_min() and may be called later by accesses of the
/proc/sys/vm/lowmem_reserve_ratio sysctl file.
The function init_per_zone_wmark_min() was moved up from a module_init to
a core_initcall to resolve a sequencing issue with khugepaged.
Unfortunately this created a sequencing issue with CMA page accounting.
The CMA pages are added to the managed page count of a zone when
cma_init_reserved_areas() is called at boot also as a core_initcall. This
makes it uncertain whether the CMA pages will be added to the managed page
counts of their zones before or after the call to
init_per_zone_wmark_min() as it becomes dependent on link order. With the
current link order the pages are added to the managed count after the
lowmem_reserve arrays are initialized at boot.
This means the lowmem_reserve values at boot may be lower than the values
used later if /proc/sys/vm/lowmem_reserve_ratio is accessed even if the
ratio values are unchanged.
In many cases the difference is not significant, but for example
an ARM platform with 1GB of memory and the following memory layout
[ 0.000000] cma: Reserved 256 MiB at 0x0000000030000000
[ 0.000000] Zone ranges:
[ 0.000000] DMA [mem 0x0000000000000000-0x000000002fffffff]
[ 0.000000] Normal empty
[ 0.000000] HighMem [mem 0x0000000030000000-0x000000003fffffff]
would result in 0 lowmem_reserve for the DMA zone. This would allow
userspace to deplete the DMA zone easily. Funnily enough
$ cat /proc/sys/vm/lowmem_reserve_ratio
would fix up the situation because it forces setup_per_zone_lowmem_reserve
as a side effect.
This commit breaks the link order dependency by invoking
init_per_zone_wmark_min() as a postcore_initcall so that the CMA pages
have the chance to be properly accounted in their zone(s) and allowing the
lowmem_reserve arrays to receive consistent values.
Link: http://lkml.kernel.org/r/1597423766-27849-1-git-send-email-opendmb@gmail.com
Fixes: bc22af74f271 ("mm: update min_free_kbytes from khugepaged after core initialization")
Signed-off-by: Doug Berger <opendmb(a)gmail.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Jason Baron <jbaron(a)akamai.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page_alloc.c~mm-include-cma-pages-in-lowmem_reserve-at-boot
+++ a/mm/page_alloc.c
@@ -7888,7 +7888,7 @@ int __meminit init_per_zone_wmark_min(vo
return 0;
}
-core_initcall(init_per_zone_wmark_min)
+postcore_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
_
From: Phillip Lougher <phillip(a)squashfs.org.uk>
Subject: squashfs: avoid bio_alloc() failure with 1Mbyte blocks
This is a regression introduced by the patch "migrate from ll_rw_block
usage to BIO".
Bio_alloc() is limited to 256 pages (1 Mbyte). This can cause a failure
when reading 1 Mbyte block filesystems. The problem is a datablock can be
fully (or almost uncompressed), requiring 256 pages, but, because blocks
are not aligned to page boundaries, it may require 257 pages to read.
Bio_kmalloc() can handle 1024 pages, and so use this for the edge
condition.
Link: http://lkml.kernel.org/r/20200815035637.15319-1-phillip@squashfs.org.uk
Fixes: 93e72b3c612a ("squashfs: migrate from ll_rw_block usage to BIO")
Signed-off-by: Phillip Lougher <phillip(a)squashfs.org.uk>
Reported-by: Nicolas Prochazka <nicolas.prochazka(a)gmail.com>
Reported-by: Tomoatsu Shimada <shimada(a)walbrix.com>
Reviewed-by: Guenter Roeck <groeck(a)chromium.org>
Cc: Philippe Liard <pliard(a)google.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Adrien Schildknecht <adrien+dev(a)schischi.me>
Cc: Daniel Rosenberg <drosen(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/squashfs/block.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
--- a/fs/squashfs/block.c~squashfs-avoid-bio_alloc-failure-with-1mbyte-blocks
+++ a/fs/squashfs/block.c
@@ -87,7 +87,11 @@ static int squashfs_bio_read(struct supe
int error, i;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, page_count);
+ if (page_count <= BIO_MAX_PAGES)
+ bio = bio_alloc(GFP_NOIO, page_count);
+ else
+ bio = bio_kmalloc(GFP_NOIO, page_count);
+
if (!bio)
return -ENOMEM;
_
From: Hugh Dickins <hughd(a)google.com>
Subject: uprobes: __replace_page() avoid BUG in munlock_vma_page()
syzbot crashed on the VM_BUG_ON_PAGE(PageTail) in munlock_vma_page(), when
called from uprobes __replace_page(). Which of many ways to fix it?
Settled on not calling when PageCompound (since Head and Tail are equals
in this context, PageCompound the usual check in uprobes.c, and the prior
use of FOLL_SPLIT_PMD will have cleared PageMlocked already).
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008161338360.20413@eggly.anvils
Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Acked-by: Song Liu <songliubraving(a)fb.com>
Acked-by: Oleg Nesterov <oleg(a)redhat.com>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [5.4+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/events/uprobes.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/events/uprobes.c~uprobes-__replace_page-avoid-bug-in-munlock_vma_page
+++ a/kernel/events/uprobes.c
@@ -205,7 +205,7 @@ static int __replace_page(struct vm_area
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
- if (vma->vm_flags & VM_LOCKED)
+ if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
munlock_vma_page(old_page);
put_page(old_page);
_
From: Jann Horn <jannh(a)google.com>
Subject: romfs: fix uninitialized memory leak in romfs_dev_read()
romfs has a superblock field that limits the size of the filesystem; data
beyond that limit is never accessed.
romfs_dev_read() fetches a caller-supplied number of bytes from the
backing device. It returns 0 on success or an error code on failure;
therefore, its API can't represent short reads, it's all-or-nothing.
However, when romfs_dev_read() detects that the requested operation would
cross the filesystem size limit, it currently silently truncates the
requested number of bytes. This e.g. means that when the content of a
file with size 0x1000 starts one byte before the filesystem size limit,
->readpage() will only fill a single byte of the supplied page while
leaving the rest uninitialized, leaking that uninitialized memory to
userspace.
Fix it by returning an error code instead of truncating the read when the
requested read operation would go beyond the end of the filesystem.
Link: http://lkml.kernel.org/r/20200818013202.2246365-1-jannh@google.com
Fixes: da4458bda237 ("NOMMU: Make it possible for RomFS to use MTD devices directly")
Signed-off-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: David Howells <dhowells(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/romfs/storage.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
--- a/fs/romfs/storage.c~romfs-fix-uninitialized-memory-leak-in-romfs_dev_read
+++ a/fs/romfs/storage.c
@@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *s
size_t limit;
limit = romfs_maxsize(sb);
- if (pos >= limit)
+ if (pos >= limit || buflen > limit - pos)
return -EIO;
- if (buflen > limit - pos)
- buflen = limit - pos;
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd)
_
From: Hugh Dickins <hughd(a)google.com>
Subject: khugepaged: adjust VM_BUG_ON_MM() in __khugepaged_enter()
syzbot crashes on the VM_BUG_ON_MM(khugepaged_test_exit(mm), mm) in
__khugepaged_enter(): yes, when one thread is about to dump core, has set
core_state, and is waiting for others, another might do something calling
__khugepaged_enter(), which now crashes because I lumped the core_state
test (known as "mmget_still_valid") into khugepaged_test_exit(). I still
think it's best to lump them together, so just in this exceptional case,
check mm->mm_users directly instead of khugepaged_test_exit().
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008141503370.18085@eggly.anvils
Fixes: bbe98f9cadff ("khugepaged: khugepaged_test_exit() check mmget_still_valid()")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Acked-by: Yang Shi <shy828301(a)gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/khugepaged.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/khugepaged.c~khugepaged-adjust-vm_bug_on_mm-in-__khugepaged_enter
+++ a/mm/khugepaged.c
@@ -466,7 +466,7 @@ int __khugepaged_enter(struct mm_struct
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
_
The patch titled
Subject: mm, THP, swap: fix allocating cluster for swapfile by mistake
has been added to the -mm tree. Its filename is
mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-swap-fix-allocating-cluste…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-swap-fix-allocating-cluste…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Gao Xiang <hsiangkao(a)redhat.com>
Subject: mm, THP, swap: fix allocating cluster for swapfile by mistake
SWP_FS is used to make swap_{read,write}page() go through the filesystem,
and it's only used for swap files over NFS. So, !SWP_FS means non NFS for
now, it could be either file backed or device backed. Something similar
goes with legacy SWP_FILE.
So in order to achieve the goal of the original patch, SWP_BLKDEV should
be used instead.
FS corruption can be observed with SSD device + XFS + fragmented swapfile
due to CONFIG_THP_SWAP=y.
I reproduced the issue with the following details:
Environment:
QEMU + upstream kernel + buildroot + NVMe (2 GB)
Kernel config:
CONFIG_BLK_DEV_NVME=y
CONFIG_THP_SWAP=y
Some reproducable steps:
mkfs.xfs -f /dev/nvme0n1
mkdir /tmp/mnt
mount /dev/nvme0n1 /tmp/mnt
bs="32k"
sz="1024m" # doesn't matter too much, I also tried 16m
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -F -S 0 -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fsync" /tmp/mnt/sw
mkswap /tmp/mnt/sw
swapon /tmp/mnt/sw
stress --vm 2 --vm-bytes 600M # doesn't matter too much as well
Symptoms:
- FS corruption (e.g. checksum failure)
- memory corruption at: 0xd2808010
- segfault
Link: https://lkml.kernel.org/r/20200820045323.7809-1-hsiangkao@redhat.com
Fixes: f0eea189e8e9 ("mm, THP, swap: Don't allocate huge cluster for file backed swap device")
Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out")
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Acked-by: Rafael Aquini <aquini(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Carlos Maiolino <cmaiolino(a)redhat.com>
Cc: Eric Sandeen <esandeen(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swapfile.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/swapfile.c~mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake
+++ a/mm/swapfile.c
@@ -1078,7 +1078,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
_
Patches currently in -mm which might be from hsiangkao(a)redhat.com are
mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake.patch
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 75802ca66354a39ab8e35822747cd08b3384a99a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx(a)redhat.com>
Date: Thu, 6 Aug 2020 23:26:11 -0700
Subject: [PATCH] mm/hugetlb: fix calculation of
adjust_range_if_pmd_sharing_possible
This is found by code observation only.
Firstly, the worst case scenario should assume the whole range was covered
by pmd sharing. The old algorithm might not work as expected for ranges
like (1g-2m, 1g+2m), where the adjusted range should be (0, 1g+2m) but the
expected range should be (0, 2g).
Since at it, remove the loop since it should not be required. With that,
the new code should be faster too when the invalidating range is huge.
Mike said:
: With range (1g-2m, 1g+2m) within a vma (0, 2g) the existing code will only
: adjust to (0, 1g+2m) which is incorrect.
:
: We should cc stable. The original reason for adjusting the range was to
: prevent data corruption (getting wrong page). Since the range is not
: always adjusted correctly, the potential for corruption still exists.
:
: However, I am fairly confident that adjust_range_if_pmd_sharing_possible
: is only gong to be called in two cases:
:
: 1) for a single page
: 2) for range == entire vma
:
: In those cases, the current code should produce the correct results.
:
: To be safe, let's just cc stable.
Fixes: 017b1660df89 ("mm: migration: fix migration of huge PMD shared pages")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Link: http://lkml.kernel.org/r/20200730201636.74778-1-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27556d4d49fe..e52c878940bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5314,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing won't be
+ * across vma after all
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 75802ca66354a39ab8e35822747cd08b3384a99a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx(a)redhat.com>
Date: Thu, 6 Aug 2020 23:26:11 -0700
Subject: [PATCH] mm/hugetlb: fix calculation of
adjust_range_if_pmd_sharing_possible
This is found by code observation only.
Firstly, the worst case scenario should assume the whole range was covered
by pmd sharing. The old algorithm might not work as expected for ranges
like (1g-2m, 1g+2m), where the adjusted range should be (0, 1g+2m) but the
expected range should be (0, 2g).
Since at it, remove the loop since it should not be required. With that,
the new code should be faster too when the invalidating range is huge.
Mike said:
: With range (1g-2m, 1g+2m) within a vma (0, 2g) the existing code will only
: adjust to (0, 1g+2m) which is incorrect.
:
: We should cc stable. The original reason for adjusting the range was to
: prevent data corruption (getting wrong page). Since the range is not
: always adjusted correctly, the potential for corruption still exists.
:
: However, I am fairly confident that adjust_range_if_pmd_sharing_possible
: is only gong to be called in two cases:
:
: 1) for a single page
: 2) for range == entire vma
:
: In those cases, the current code should produce the correct results.
:
: To be safe, let's just cc stable.
Fixes: 017b1660df89 ("mm: migration: fix migration of huge PMD shared pages")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Link: http://lkml.kernel.org/r/20200730201636.74778-1-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27556d4d49fe..e52c878940bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5314,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing won't be
+ * across vma after all
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 75802ca66354a39ab8e35822747cd08b3384a99a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx(a)redhat.com>
Date: Thu, 6 Aug 2020 23:26:11 -0700
Subject: [PATCH] mm/hugetlb: fix calculation of
adjust_range_if_pmd_sharing_possible
This is found by code observation only.
Firstly, the worst case scenario should assume the whole range was covered
by pmd sharing. The old algorithm might not work as expected for ranges
like (1g-2m, 1g+2m), where the adjusted range should be (0, 1g+2m) but the
expected range should be (0, 2g).
Since at it, remove the loop since it should not be required. With that,
the new code should be faster too when the invalidating range is huge.
Mike said:
: With range (1g-2m, 1g+2m) within a vma (0, 2g) the existing code will only
: adjust to (0, 1g+2m) which is incorrect.
:
: We should cc stable. The original reason for adjusting the range was to
: prevent data corruption (getting wrong page). Since the range is not
: always adjusted correctly, the potential for corruption still exists.
:
: However, I am fairly confident that adjust_range_if_pmd_sharing_possible
: is only gong to be called in two cases:
:
: 1) for a single page
: 2) for range == entire vma
:
: In those cases, the current code should produce the correct results.
:
: To be safe, let's just cc stable.
Fixes: 017b1660df89 ("mm: migration: fix migration of huge PMD shared pages")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Link: http://lkml.kernel.org/r/20200730201636.74778-1-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27556d4d49fe..e52c878940bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5314,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing won't be
+ * across vma after all
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 75802ca66354a39ab8e35822747cd08b3384a99a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx(a)redhat.com>
Date: Thu, 6 Aug 2020 23:26:11 -0700
Subject: [PATCH] mm/hugetlb: fix calculation of
adjust_range_if_pmd_sharing_possible
This is found by code observation only.
Firstly, the worst case scenario should assume the whole range was covered
by pmd sharing. The old algorithm might not work as expected for ranges
like (1g-2m, 1g+2m), where the adjusted range should be (0, 1g+2m) but the
expected range should be (0, 2g).
Since at it, remove the loop since it should not be required. With that,
the new code should be faster too when the invalidating range is huge.
Mike said:
: With range (1g-2m, 1g+2m) within a vma (0, 2g) the existing code will only
: adjust to (0, 1g+2m) which is incorrect.
:
: We should cc stable. The original reason for adjusting the range was to
: prevent data corruption (getting wrong page). Since the range is not
: always adjusted correctly, the potential for corruption still exists.
:
: However, I am fairly confident that adjust_range_if_pmd_sharing_possible
: is only gong to be called in two cases:
:
: 1) for a single page
: 2) for range == entire vma
:
: In those cases, the current code should produce the correct results.
:
: To be safe, let's just cc stable.
Fixes: 017b1660df89 ("mm: migration: fix migration of huge PMD shared pages")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Link: http://lkml.kernel.org/r/20200730201636.74778-1-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27556d4d49fe..e52c878940bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5314,25 +5314,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing won't be
+ * across vma after all
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
When offlining CPU's, fixup_irqs() migrates all interrupts away from the
outgoing CPU to an online CPU. Its always possible the device sent an
interrupt to the previous CPU destination. Pending interrupt bit in IRR in
lapic identifies such interrupts. apic_soft_disable() will not capture any
new interrupts in IRR. This causes interrupts from device to be lost during
cpu offline. The issue was found when explicitly setting MSI affinity to a
CPU and immediately offlining it. It was simple to recreate with a USB
ethernet device and doing I/O to it while the CPU is offlined. Lost
interrupts happen even when Interrupt Remapping is enabled.
Current code does apic_soft_disable() before migrating interrupts.
native_cpu_disable()
{
...
apic_soft_disable();
cpu_disable_common();
--> fixup_irqs(); // Too late to capture anything in IRR.
}
Just fliping the above call sequence seems to hit the IRR checks
and the lost interrupt is fixed for both legacy MSI and when
interrupt remapping is enabled.
Fixes: 60dcaad5736f ("x86/hotplug: Silence APIC and NMI when CPU is dead")
Link: https://lore.kernel.org/lkml/875zdarr4h.fsf@nanos.tec.linutronix.de/
Signed-off-by: Ashok Raj <ashok.raj(a)intel.com>
To: linux-kernel(a)vger.kernel.org
To: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Sukumar Ghorai <sukumar.ghorai(a)intel.com>
Cc: Srikanth Nandamuri <srikanth.nandamuri(a)intel.com>
Cc: Evan Green <evgreen(a)chromium.org>
Cc: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/smpboot.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..278cc9f92f2f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1603,13 +1603,20 @@ int native_cpu_disable(void)
if (ret)
return ret;
+ cpu_disable_common();
/*
* Disable the local APIC. Otherwise IPI broadcasts will reach
* it. It still responds normally to INIT, NMI, SMI, and SIPI
- * messages.
+ * messages. Its important to do apic_soft_disable() after
+ * fixup_irqs(), because fixup_irqs() called from cpu_disable_common()
+ * depends on IRR being set. After apic_soft_disable() CPU preserves
+ * currently set IRR/ISR but new interrupts will not set IRR.
+ * This causes interrupts sent to outgoing cpu before completion
+ * of irq migration to be lost. Check SDM Vol 3 "10.4.7.2 Local
+ * APIC State after It Has been Software Disabled" section for more
+ * details.
*/
apic_soft_disable();
- cpu_disable_common();
return 0;
}
--
2.13.6
As per PAPR we have to look for both EPOW sensor value and event modifier to
identify type of event and take appropriate action.
Sensor value = 3 (EPOW_SYSTEM_SHUTDOWN) schedule system to be shutdown after
OS defined delay (default 10 mins).
EPOW Event Modifier for sensor value = 3:
We have to initiate immediate shutdown for most of the event modifier except
value = 2 (system running on UPS).
Checking with firmware document its clear that we have to wait for predefined
time before initiating shutdown. If power is restored within time we should
cancel the shutdown process. I think commit 79872e35 accidently enabled
immediate poweroff for EPOW_SHUTDOWN_ON_UPS event.
We have user space tool (rtas_errd) on LPAR to monitor for EPOW_SHUTDOWN_ON_UPS.
Once it gets event it initiates shutdown after predefined time. Also starts
monitoring for any new EPOW events. If it receives "Power restored" event
before predefined time it will cancel the shutdown. Otherwise after
predefined time it will shutdown the system.
Fixes: 79872e35 (powerpc/pseries: All events of EPOW_SYSTEM_SHUTDOWN must initiate shutdown)
Cc: stable(a)vger.kernel.org # v4.0+
Cc: Tyrel Datwyler <tyreld(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Signed-off-by: Vasant Hegde <hegdevasant(a)linux.vnet.ibm.com>
---
Changes in v2:
- Updated patch description based on mpe, Tyrel comment.
-Vasant
arch/powerpc/platforms/pseries/ras.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..13c86a292c6d 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -184,7 +184,6 @@ static void handle_system_shutdown(char event_modifier)
case EPOW_SHUTDOWN_ON_UPS:
pr_emerg("Loss of system power detected. System is running on"
" UPS/battery. Check RTAS error log for details\n");
- orderly_poweroff(true);
break;
case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
--
2.26.2
From: WANG Cong <xiyou.wangcong(a)gmail.com>
[ Upstream commit 199ab00f3cdb6f154ea93fa76fd80192861a821d ]
Andrey reported a out-of-bound access in ip6_tnl_xmit(), this
is because we use an ipv4 dst in ip6_tnl_xmit() and cast an IPv4
neigh key as an IPv6 address:
neigh = dst_neigh_lookup(skb_dst(skb),
&ipv6_hdr(skb)->daddr);
if (!neigh)
goto tx_err_link_failure;
addr6 = (struct in6_addr *)&neigh->primary_key; // <=== HERE
addr_type = ipv6_addr_type(addr6);
if (addr_type == IPV6_ADDR_ANY)
addr6 = &ipv6_hdr(skb)->daddr;
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
Also the network header of the skb at this point should be still IPv4
for 4in6 tunnels, we shold not just use it as IPv6 header.
This patch fixes it by checking if skb->protocol is ETH_P_IPV6: if it
is, we are safe to do the nexthop lookup using skb_dst() and
ipv6_hdr(skb)->daddr; if not (aka IPv4), we have no clue about which
dest address we can pick here, we have to rely on callers to fill it
from tunnel config, so just fall to ip6_route_output() to make the
decision.
Fixes: ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel endpoints.")
Reported-by: Andrey Konovalov <andreyknvl(a)google.com>
Reported-by: syzbot+01400f5fc51cf4747bec(a)syzkaller.appspotmail.com
Tested-by: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Steffen Klassert <steffen.klassert(a)secunet.com>
Signed-off-by: Cong Wang <xiyou.wangcong(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Alessio Balsini <balsini(a)android.com>
---
net/ipv6/ip6_tunnel.c | 32 +++++++++++++++++---------------
1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index f072a4c4575c..96563990d654 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -972,26 +972,28 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
- struct in6_addr *addr6;
- struct neighbour *neigh;
- int addr_type;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ struct neighbour *neigh;
+ int addr_type;
- if (!skb_dst(skb))
- goto tx_err_link_failure;
+ if (!skb_dst(skb))
+ goto tx_err_link_failure;
- neigh = dst_neigh_lookup(skb_dst(skb),
- &ipv6_hdr(skb)->daddr);
- if (!neigh)
- goto tx_err_link_failure;
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_err_link_failure;
- addr6 = (struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
+ addr6 = (struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
- if (addr_type == IPV6_ADDR_ANY)
- addr6 = &ipv6_hdr(skb)->daddr;
+ if (addr_type == IPV6_ADDR_ANY)
+ addr6 = &ipv6_hdr(skb)->daddr;
- memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
- neigh_release(neigh);
+ memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
+ neigh_release(neigh);
+ }
} else if (!fl6->flowi6_mark)
dst = dst_cache_get(&t->dst_cache);
--
2.28.0.297.g1956fa8f8d-goog
Let's not try and use PAT attributes for I915_MAP_WC is the CPU doesn't
support PAT.
Fixes: 6056e50033d9 ("drm/i915/gem: Support discontiguous lmem object maps")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: <stable(a)vger.kernel.org> # v5.6+
---
drivers/gpu/drm/i915/gem/i915_gem_pages.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 7050519c87a4..5f1725268988 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -254,6 +254,9 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC)
return NULL;
+ if (type == I915_MAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
+ return NULL;
+
/* A single page can always be kmapped */
if (n_pte == 1 && type == I915_MAP_WB)
return kmap(sg_page(sgt->sgl));
--
2.20.1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a34a0a632dd991a371fec56431d73279f9c54029 Mon Sep 17 00:00:00 2001
From: Xin Xiong <xiongx18(a)fudan.edu.cn>
Date: Sun, 19 Jul 2020 23:45:45 +0800
Subject: [PATCH] drm: fix drm_dp_mst_port refcount leaks in
drm_dp_mst_allocate_vcpi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
drm_dp_mst_allocate_vcpi() invokes
drm_dp_mst_topology_get_port_validated(), which increases the refcount
of the "port".
These reference counting issues take place in two exception handling
paths separately. Either when “slots” is less than 0 or when
drm_dp_init_vcpi() returns a negative value, the function forgets to
reduce the refcnt increased drm_dp_mst_topology_get_port_validated(),
which results in a refcount leak.
Fix these issues by pulling up the error handling when "slots" is less
than 0, and calling drm_dp_mst_topology_put_port() before termination
when drm_dp_init_vcpi() returns a negative value.
Fixes: 1e797f556c61 ("drm/dp: Split drm_dp_mst_allocate_vcpi")
Cc: <stable(a)vger.kernel.org> # v4.12+
Signed-off-by: Xiyu Yang <xiyuyang19(a)fudan.edu.cn>
Signed-off-by: Xin Tan <tanxin.ctf(a)gmail.com>
Signed-off-by: Xin Xiong <xiongx18(a)fudan.edu.cn>
Reviewed-by: Lyude Paul <lyude(a)redhat.com>
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200719154545.GA41231@xin-vi…
diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
index 09b32289497e..b23cb2fec3f3 100644
--- a/drivers/gpu/drm/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/drm_dp_mst_topology.c
@@ -4308,11 +4308,11 @@ bool drm_dp_mst_allocate_vcpi(struct drm_dp_mst_topology_mgr *mgr,
{
int ret;
- port = drm_dp_mst_topology_get_port_validated(mgr, port);
- if (!port)
+ if (slots < 0)
return false;
- if (slots < 0)
+ port = drm_dp_mst_topology_get_port_validated(mgr, port);
+ if (!port)
return false;
if (port->vcpi.vcpi > 0) {
@@ -4328,6 +4328,7 @@ bool drm_dp_mst_allocate_vcpi(struct drm_dp_mst_topology_mgr *mgr,
if (ret) {
DRM_DEBUG_KMS("failed to init vcpi slots=%d max=63 ret=%d\n",
DIV_ROUND_UP(pbn, mgr->pbn_div), ret);
+ drm_dp_mst_topology_put_port(port);
goto out;
}
DRM_DEBUG_KMS("initing vcpi for pbn=%d slots=%d\n",
The ordering of psp_tmr_terminate() and psp_asd_unload()
got reversed when the patches were applied to stable.
This patch does not exist in Linus' tree because the ordering
is correct there. It got reversed when the patches were applied
to stable. This patch is for stable only.
Fixes: 22ff658396b446 ("drm/amdgpu: asd function needs to be unloaded in suspend phase")
Fixes: 2c41c968c6f648 ("drm/amdgpu: add TMR destory function for psp")
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Cc: stable(a)vger.kernel.org # 5.7.x
Cc: Huang Rui <ray.huang(a)amd.com>
---
Make the description more explicit as to why this patch is only for stable.
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 3c6f60c5b1a5..088f43ebdceb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1679,15 +1679,15 @@ static int psp_suspend(void *handle)
}
}
- ret = psp_tmr_terminate(psp);
+ ret = psp_asd_unload(psp);
if (ret) {
- DRM_ERROR("Falied to terminate tmr\n");
+ DRM_ERROR("Failed to unload asd\n");
return ret;
}
- ret = psp_asd_unload(psp);
+ ret = psp_tmr_terminate(psp);
if (ret) {
- DRM_ERROR("Failed to unload asd\n");
+ DRM_ERROR("Falied to terminate tmr\n");
return ret;
}
--
2.25.4
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec11fe3705a7d434ec3493bdaddb9b6367e7366a Mon Sep 17 00:00:00 2001
From: hersen wu <hersenxs.wu(a)amd.com>
Date: Mon, 22 Jun 2020 13:29:16 -0400
Subject: [PATCH] drm/amd/display: OLED panel backlight adjust not work with
external display connected
[Why]
amdgpu_dm->backlight_caps is for single eDP only. the caps are upddated
for very connector. Real eDP caps will be overwritten by other external
display. For OLED panel, caps->aux_support is set to 1 for OLED pnael.
after external connected, caps+.aux_support is set to 0. This causes
OLED backlight adjustment not work.
[How]
within update_conector_ext_caps, backlight caps will be updated only for
eDP connector.
Cc: stable(a)vger.kernel.org
Signed-off-by: hersen wu <hersenxs.wu(a)amd.com>
Reviewed-by: Nicholas Kazlauskas <Nicholas.Kazlauskas(a)amd.com>
Acked-by: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 6f937e25a62c..9c283531e1a2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2028,6 +2028,7 @@ static void update_connector_ext_caps(struct amdgpu_dm_connector *aconnector)
struct amdgpu_display_manager *dm;
struct drm_connector *conn_base;
struct amdgpu_device *adev;
+ struct dc_link *link = NULL;
static const u8 pre_computed_values[] = {
50, 51, 52, 53, 55, 56, 57, 58, 59, 61, 62, 63, 65, 66, 68, 69,
71, 72, 74, 75, 77, 79, 81, 82, 84, 86, 88, 90, 92, 94, 96, 98};
@@ -2035,6 +2036,10 @@ static void update_connector_ext_caps(struct amdgpu_dm_connector *aconnector)
if (!aconnector || !aconnector->dc_link)
return;
+ link = aconnector->dc_link;
+ if (link->connector_signal != SIGNAL_TYPE_EDP)
+ return;
+
conn_base = &aconnector->base;
adev = conn_base->dev->dev_private;
dm = &adev->dm;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8425400759749e23aaa831f2482b96211201af33 Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov(a)linux.com>
Date: Fri, 5 Jun 2020 20:37:43 +0300
Subject: [PATCH] drm/amd/display: Use kvfree() to free coeff in
build_regamma()
Use kvfree() instead of kfree() to free coeff in build_regamma()
because the memory is allocated with kvzalloc().
Fixes: e752058b8671 ("drm/amd/display: Optimize gamma calculations")
Cc: stable(a)vger.kernel.org
Signed-off-by: Denis Efremov <efremov(a)linux.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
index 9431b48aecb4..56bb1f9f77ce 100644
--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
+++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
@@ -843,7 +843,7 @@ static bool build_regamma(struct pwl_float_data_ex *rgb_regamma,
pow_buffer_ptr = -1; // reset back to no optimize
ret = true;
release:
- kfree(coeff);
+ kvfree(coeff);
return ret;
}
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8425400759749e23aaa831f2482b96211201af33 Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov(a)linux.com>
Date: Fri, 5 Jun 2020 20:37:43 +0300
Subject: [PATCH] drm/amd/display: Use kvfree() to free coeff in
build_regamma()
Use kvfree() instead of kfree() to free coeff in build_regamma()
because the memory is allocated with kvzalloc().
Fixes: e752058b8671 ("drm/amd/display: Optimize gamma calculations")
Cc: stable(a)vger.kernel.org
Signed-off-by: Denis Efremov <efremov(a)linux.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
index 9431b48aecb4..56bb1f9f77ce 100644
--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
+++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
@@ -843,7 +843,7 @@ static bool build_regamma(struct pwl_float_data_ex *rgb_regamma,
pow_buffer_ptr = -1; // reset back to no optimize
ret = true;
release:
- kfree(coeff);
+ kvfree(coeff);
return ret;
}
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8425400759749e23aaa831f2482b96211201af33 Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov(a)linux.com>
Date: Fri, 5 Jun 2020 20:37:43 +0300
Subject: [PATCH] drm/amd/display: Use kvfree() to free coeff in
build_regamma()
Use kvfree() instead of kfree() to free coeff in build_regamma()
because the memory is allocated with kvzalloc().
Fixes: e752058b8671 ("drm/amd/display: Optimize gamma calculations")
Cc: stable(a)vger.kernel.org
Signed-off-by: Denis Efremov <efremov(a)linux.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
index 9431b48aecb4..56bb1f9f77ce 100644
--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
+++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
@@ -843,7 +843,7 @@ static bool build_regamma(struct pwl_float_data_ex *rgb_regamma,
pow_buffer_ptr = -1; // reset back to no optimize
ret = true;
release:
- kfree(coeff);
+ kvfree(coeff);
return ret;
}
The patch below was submitted to be applied to the 5.8-stable tree.
I fail to see how this patch meets the stable kernel rules as found at
Documentation/process/stable-kernel-rules.rst.
I could be totally wrong, and if so, please respond to
<stable(a)vger.kernel.org> and let me know why this patch should be
applied. Otherwise, it is now dropped from my patch queues, never to be
seen again.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1d8d42ba365101fa68d210c0e2ed2bc9582fda6c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Fri, 5 Jun 2020 15:57:50 +0200
Subject: [PATCH] drm/mgag200: Remove declaration of mgag200_mmap() from header
file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Commit 94668ac796a5 ("drm/mgag200: Convert mgag200 driver to VRAM MM")
removed the implementation of mgag200_mmap(). Also remove the declaration.
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Acked-by: Sam Ravnborg <sam(a)ravnborg.org>
Fixes: 94668ac796a5 ("drm/mgag200: Convert mgag200 driver to VRAM MM")
Cc: Gerd Hoffmann <kraxel(a)redhat.com>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Krzysztof Kozlowski <krzk(a)kernel.org>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: Sam Ravnborg <sam(a)ravnborg.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "Noralf Trønnes" <noralf(a)tronnes.org>
Cc: Armijn Hemel <armijn(a)tjaldur.nl>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: Emil Velikov <emil.velikov(a)collabora.com>
Cc: <stable(a)vger.kernel.org> # v5.3+
Link: https://patchwork.freedesktop.org/patch/msgid/20200605135803.19811-2-tzimme…
diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.h b/drivers/gpu/drm/mgag200/mgag200_drv.h
index 47df62b1ad29..92b6679029fe 100644
--- a/drivers/gpu/drm/mgag200/mgag200_drv.h
+++ b/drivers/gpu/drm/mgag200/mgag200_drv.h
@@ -198,6 +198,5 @@ void mgag200_i2c_destroy(struct mga_i2c_chan *i2c);
int mgag200_mm_init(struct mga_device *mdev);
void mgag200_mm_fini(struct mga_device *mdev);
-int mgag200_mmap(struct file *filp, struct vm_area_struct *vma);
#endif /* __MGAG200_DRV_H__ */
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 119c53d2d4044c59c450c4f5a568d80b9d861856 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 8 Jul 2020 16:49:11 +0100
Subject: [PATCH] drm/vgem: Replace opencoded version of
drm_gem_dumb_map_offset()
drm_gem_dumb_map_offset() now exists and does everything
vgem_gem_dump_map does and *ought* to do.
In particular, vgem_gem_dumb_map() was trying to reject mmapping an
imported dmabuf by checking the existence of obj->filp. Unfortunately,
we always allocated an obj->filp, even if unused for an imported dmabuf.
Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
obj->import_attach to reject such invalid mmaps.
This prevents vgem from allowing userspace mmapping the dumb handle and
attempting to incorrectly fault in remote pages belonging to another
device, where there may not even be a struct page.
v2: Use the default drm_gem_dumb_map_offset() callback
Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.13+
Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@…
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index e4dc7b267a0b..a775feda1cc7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -230,32 +230,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
return 0;
}
-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
- uint32_t handle, uint64_t *offset)
-{
- struct drm_gem_object *obj;
- int ret;
-
- obj = drm_gem_object_lookup(file, handle);
- if (!obj)
- return -ENOENT;
-
- if (!obj->filp) {
- ret = -EINVAL;
- goto unref;
- }
-
- ret = drm_gem_create_mmap_offset(obj);
- if (ret)
- goto unref;
-
- *offset = drm_vma_node_offset_addr(&obj->vma_node);
-unref:
- drm_gem_object_put(obj);
-
- return ret;
-}
-
static struct drm_ioctl_desc vgem_ioctls[] = {
DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW),
@@ -446,7 +420,6 @@ static struct drm_driver vgem_driver = {
.fops = &vgem_driver_fops,
.dumb_create = vgem_gem_dumb_create,
- .dumb_map_offset = vgem_gem_dumb_map,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 119c53d2d4044c59c450c4f5a568d80b9d861856 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 8 Jul 2020 16:49:11 +0100
Subject: [PATCH] drm/vgem: Replace opencoded version of
drm_gem_dumb_map_offset()
drm_gem_dumb_map_offset() now exists and does everything
vgem_gem_dump_map does and *ought* to do.
In particular, vgem_gem_dumb_map() was trying to reject mmapping an
imported dmabuf by checking the existence of obj->filp. Unfortunately,
we always allocated an obj->filp, even if unused for an imported dmabuf.
Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
obj->import_attach to reject such invalid mmaps.
This prevents vgem from allowing userspace mmapping the dumb handle and
attempting to incorrectly fault in remote pages belonging to another
device, where there may not even be a struct page.
v2: Use the default drm_gem_dumb_map_offset() callback
Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.13+
Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@…
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index e4dc7b267a0b..a775feda1cc7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -230,32 +230,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
return 0;
}
-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
- uint32_t handle, uint64_t *offset)
-{
- struct drm_gem_object *obj;
- int ret;
-
- obj = drm_gem_object_lookup(file, handle);
- if (!obj)
- return -ENOENT;
-
- if (!obj->filp) {
- ret = -EINVAL;
- goto unref;
- }
-
- ret = drm_gem_create_mmap_offset(obj);
- if (ret)
- goto unref;
-
- *offset = drm_vma_node_offset_addr(&obj->vma_node);
-unref:
- drm_gem_object_put(obj);
-
- return ret;
-}
-
static struct drm_ioctl_desc vgem_ioctls[] = {
DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW),
@@ -446,7 +420,6 @@ static struct drm_driver vgem_driver = {
.fops = &vgem_driver_fops,
.dumb_create = vgem_gem_dumb_create,
- .dumb_map_offset = vgem_gem_dumb_map,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 119c53d2d4044c59c450c4f5a568d80b9d861856 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 8 Jul 2020 16:49:11 +0100
Subject: [PATCH] drm/vgem: Replace opencoded version of
drm_gem_dumb_map_offset()
drm_gem_dumb_map_offset() now exists and does everything
vgem_gem_dump_map does and *ought* to do.
In particular, vgem_gem_dumb_map() was trying to reject mmapping an
imported dmabuf by checking the existence of obj->filp. Unfortunately,
we always allocated an obj->filp, even if unused for an imported dmabuf.
Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
obj->import_attach to reject such invalid mmaps.
This prevents vgem from allowing userspace mmapping the dumb handle and
attempting to incorrectly fault in remote pages belonging to another
device, where there may not even be a struct page.
v2: Use the default drm_gem_dumb_map_offset() callback
Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.13+
Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@…
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index e4dc7b267a0b..a775feda1cc7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -230,32 +230,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
return 0;
}
-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
- uint32_t handle, uint64_t *offset)
-{
- struct drm_gem_object *obj;
- int ret;
-
- obj = drm_gem_object_lookup(file, handle);
- if (!obj)
- return -ENOENT;
-
- if (!obj->filp) {
- ret = -EINVAL;
- goto unref;
- }
-
- ret = drm_gem_create_mmap_offset(obj);
- if (ret)
- goto unref;
-
- *offset = drm_vma_node_offset_addr(&obj->vma_node);
-unref:
- drm_gem_object_put(obj);
-
- return ret;
-}
-
static struct drm_ioctl_desc vgem_ioctls[] = {
DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW),
@@ -446,7 +420,6 @@ static struct drm_driver vgem_driver = {
.fops = &vgem_driver_fops,
.dumb_create = vgem_gem_dumb_create,
- .dumb_map_offset = vgem_gem_dumb_map,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 119c53d2d4044c59c450c4f5a568d80b9d861856 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 8 Jul 2020 16:49:11 +0100
Subject: [PATCH] drm/vgem: Replace opencoded version of
drm_gem_dumb_map_offset()
drm_gem_dumb_map_offset() now exists and does everything
vgem_gem_dump_map does and *ought* to do.
In particular, vgem_gem_dumb_map() was trying to reject mmapping an
imported dmabuf by checking the existence of obj->filp. Unfortunately,
we always allocated an obj->filp, even if unused for an imported dmabuf.
Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
obj->import_attach to reject such invalid mmaps.
This prevents vgem from allowing userspace mmapping the dumb handle and
attempting to incorrectly fault in remote pages belonging to another
device, where there may not even be a struct page.
v2: Use the default drm_gem_dumb_map_offset() callback
Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.13+
Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@…
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index e4dc7b267a0b..a775feda1cc7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -230,32 +230,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
return 0;
}
-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
- uint32_t handle, uint64_t *offset)
-{
- struct drm_gem_object *obj;
- int ret;
-
- obj = drm_gem_object_lookup(file, handle);
- if (!obj)
- return -ENOENT;
-
- if (!obj->filp) {
- ret = -EINVAL;
- goto unref;
- }
-
- ret = drm_gem_create_mmap_offset(obj);
- if (ret)
- goto unref;
-
- *offset = drm_vma_node_offset_addr(&obj->vma_node);
-unref:
- drm_gem_object_put(obj);
-
- return ret;
-}
-
static struct drm_ioctl_desc vgem_ioctls[] = {
DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW),
@@ -446,7 +420,6 @@ static struct drm_driver vgem_driver = {
.fops = &vgem_driver_fops,
.dumb_create = vgem_gem_dumb_create,
- .dumb_map_offset = vgem_gem_dumb_map,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 119c53d2d4044c59c450c4f5a568d80b9d861856 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 8 Jul 2020 16:49:11 +0100
Subject: [PATCH] drm/vgem: Replace opencoded version of
drm_gem_dumb_map_offset()
drm_gem_dumb_map_offset() now exists and does everything
vgem_gem_dump_map does and *ought* to do.
In particular, vgem_gem_dumb_map() was trying to reject mmapping an
imported dmabuf by checking the existence of obj->filp. Unfortunately,
we always allocated an obj->filp, even if unused for an imported dmabuf.
Instead, the drm_gem_dumb_map_offset(), since commit 90378e589192
("drm/gem: drm_gem_dumb_map_offset(): reject dma-buf"), uses the
obj->import_attach to reject such invalid mmaps.
This prevents vgem from allowing userspace mmapping the dumb handle and
attempting to incorrectly fault in remote pages belonging to another
device, where there may not even be a struct page.
v2: Use the default drm_gem_dumb_map_offset() callback
Fixes: af33a9190d02 ("drm/vgem: Enable dmabuf import interfaces")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.13+
Link: https://patchwork.freedesktop.org/patch/msgid/20200708154911.21236-1-chris@…
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index e4dc7b267a0b..a775feda1cc7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -230,32 +230,6 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
return 0;
}
-static int vgem_gem_dumb_map(struct drm_file *file, struct drm_device *dev,
- uint32_t handle, uint64_t *offset)
-{
- struct drm_gem_object *obj;
- int ret;
-
- obj = drm_gem_object_lookup(file, handle);
- if (!obj)
- return -ENOENT;
-
- if (!obj->filp) {
- ret = -EINVAL;
- goto unref;
- }
-
- ret = drm_gem_create_mmap_offset(obj);
- if (ret)
- goto unref;
-
- *offset = drm_vma_node_offset_addr(&obj->vma_node);
-unref:
- drm_gem_object_put(obj);
-
- return ret;
-}
-
static struct drm_ioctl_desc vgem_ioctls[] = {
DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW),
@@ -446,7 +420,6 @@ static struct drm_driver vgem_driver = {
.fops = &vgem_driver_fops,
.dumb_create = vgem_gem_dumb_create,
- .dumb_map_offset = vgem_gem_dumb_map,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 511b6d9aed417739b6aa49d0b6b4354ad21020f1 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Tue, 26 May 2020 10:07:53 +0100
Subject: [PATCH] drm/i915/gt: Do not schedule normal requests immediately
along virtual
When we push a virtual request onto the HW, we update the rq->engine to
point to the physical engine. A request that is then submitted by the
user that waits upon the virtual engine, but along the physical engine
in use, will then see that it is due to be submitted to the same engine
and take a shortcut (and be queued without waiting for the completion
fence). However, the virtual request may be preempted (either by higher
priority users, or by timeslicing) and removed from the physical engine
to be migrated over to one of its siblings. The dependent normal request
however is oblivious to the removal of the virtual request and remains
queued to execute on HW, believing that once it reaches the head of its
queue all of its predecessors will have completed executing!
v2: Beware restriction of signal->execution_mask prior to submission.
Fixes: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Testcase: igt/gem_exec_balancer/sliced
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.3+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200526090753.11329-2-chris@…
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index e64d82f7c830..0d810a62ff46 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1242,6 +1242,25 @@ i915_request_await_execution(struct i915_request *rq,
return 0;
}
+static int
+await_request_submit(struct i915_request *to, struct i915_request *from)
+{
+ /*
+ * If we are waiting on a virtual engine, then it may be
+ * constrained to execute on a single engine *prior* to submission.
+ * When it is submitted, it will be first submitted to the virtual
+ * engine and then passed to the physical engine. We cannot allow
+ * the waiter to be submitted immediately to the physical engine
+ * as it may then bypass the virtual request.
+ */
+ if (to->engine == READ_ONCE(from->engine))
+ return i915_sw_fence_await_sw_fence_gfp(&to->submit,
+ &from->submit,
+ I915_FENCE_GFP);
+ else
+ return __i915_request_await_execution(to, from, NULL);
+}
+
static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
@@ -1263,10 +1282,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
return ret;
}
- if (to->engine == READ_ONCE(from->engine))
- ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
- &from->submit,
- I915_FENCE_GFP);
+ if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask)))
+ ret = await_request_submit(to, from);
else
ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
if (ret < 0)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 511b6d9aed417739b6aa49d0b6b4354ad21020f1 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Tue, 26 May 2020 10:07:53 +0100
Subject: [PATCH] drm/i915/gt: Do not schedule normal requests immediately
along virtual
When we push a virtual request onto the HW, we update the rq->engine to
point to the physical engine. A request that is then submitted by the
user that waits upon the virtual engine, but along the physical engine
in use, will then see that it is due to be submitted to the same engine
and take a shortcut (and be queued without waiting for the completion
fence). However, the virtual request may be preempted (either by higher
priority users, or by timeslicing) and removed from the physical engine
to be migrated over to one of its siblings. The dependent normal request
however is oblivious to the removal of the virtual request and remains
queued to execute on HW, believing that once it reaches the head of its
queue all of its predecessors will have completed executing!
v2: Beware restriction of signal->execution_mask prior to submission.
Fixes: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Testcase: igt/gem_exec_balancer/sliced
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.3+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200526090753.11329-2-chris@…
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index e64d82f7c830..0d810a62ff46 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1242,6 +1242,25 @@ i915_request_await_execution(struct i915_request *rq,
return 0;
}
+static int
+await_request_submit(struct i915_request *to, struct i915_request *from)
+{
+ /*
+ * If we are waiting on a virtual engine, then it may be
+ * constrained to execute on a single engine *prior* to submission.
+ * When it is submitted, it will be first submitted to the virtual
+ * engine and then passed to the physical engine. We cannot allow
+ * the waiter to be submitted immediately to the physical engine
+ * as it may then bypass the virtual request.
+ */
+ if (to->engine == READ_ONCE(from->engine))
+ return i915_sw_fence_await_sw_fence_gfp(&to->submit,
+ &from->submit,
+ I915_FENCE_GFP);
+ else
+ return __i915_request_await_execution(to, from, NULL);
+}
+
static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
@@ -1263,10 +1282,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
return ret;
}
- if (to->engine == READ_ONCE(from->engine))
- ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
- &from->submit,
- I915_FENCE_GFP);
+ if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask)))
+ ret = await_request_submit(to, from);
else
ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
if (ret < 0)
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 511b6d9aed417739b6aa49d0b6b4354ad21020f1 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Tue, 26 May 2020 10:07:53 +0100
Subject: [PATCH] drm/i915/gt: Do not schedule normal requests immediately
along virtual
When we push a virtual request onto the HW, we update the rq->engine to
point to the physical engine. A request that is then submitted by the
user that waits upon the virtual engine, but along the physical engine
in use, will then see that it is due to be submitted to the same engine
and take a shortcut (and be queued without waiting for the completion
fence). However, the virtual request may be preempted (either by higher
priority users, or by timeslicing) and removed from the physical engine
to be migrated over to one of its siblings. The dependent normal request
however is oblivious to the removal of the virtual request and remains
queued to execute on HW, believing that once it reaches the head of its
queue all of its predecessors will have completed executing!
v2: Beware restriction of signal->execution_mask prior to submission.
Fixes: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Testcase: igt/gem_exec_balancer/sliced
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.3+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200526090753.11329-2-chris@…
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index e64d82f7c830..0d810a62ff46 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1242,6 +1242,25 @@ i915_request_await_execution(struct i915_request *rq,
return 0;
}
+static int
+await_request_submit(struct i915_request *to, struct i915_request *from)
+{
+ /*
+ * If we are waiting on a virtual engine, then it may be
+ * constrained to execute on a single engine *prior* to submission.
+ * When it is submitted, it will be first submitted to the virtual
+ * engine and then passed to the physical engine. We cannot allow
+ * the waiter to be submitted immediately to the physical engine
+ * as it may then bypass the virtual request.
+ */
+ if (to->engine == READ_ONCE(from->engine))
+ return i915_sw_fence_await_sw_fence_gfp(&to->submit,
+ &from->submit,
+ I915_FENCE_GFP);
+ else
+ return __i915_request_await_execution(to, from, NULL);
+}
+
static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
@@ -1263,10 +1282,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
return ret;
}
- if (to->engine == READ_ONCE(from->engine))
- ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
- &from->submit,
- I915_FENCE_GFP);
+ if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask)))
+ ret = await_request_submit(to, from);
else
ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
if (ret < 0)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1d9221e9d395c8c80d99d002bea3c9b6dd192d6a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 13 Jul 2020 15:16:36 +0100
Subject: [PATCH] drm/i915: Skip signaling a signaled request
Preempt-to-busy introduces various fascinating complications in that the
requests may complete as we are unsubmitting them from HW. As they may
then signal after unsubmission, we may find ourselves having to cleanup
the signaling request from within the signaling callback. This causes us
to recurse onto the same i915_request.lock.
However, if the request is already signaled (as it will be before we
enter the signal callbacks), we know we can skip the signaling of that
request during submission, neatly evading the spinlock recursion.
unsubmit(ve.rq0) # timeslice expiration or other preemption
-> virtual_submit_request(ve.rq0)
dma_fence_signal(ve.rq0) # request completed before preemption ack
-> submit_notify(ve.rq1)
-> virtual_submit_request(ve.rq1) # sees that we have completed ve.rq0
-> __i915_request_submit(ve.rq0)
[ 264.210142] BUG: spinlock recursion on CPU#2, sample_multi_tr/2093
[ 264.210150] lock: 0xffff9efd6ac55080, .magic: dead4ead, .owner: sample_multi_tr/2093, .owner_cpu: 2
[ 264.210155] CPU: 2 PID: 2093 Comm: sample_multi_tr Tainted: G U
[ 264.210158] Hardware name: Intel Corporation CoffeeLake Client Platform/CoffeeLake S UDIMM RVP, BIOS CNLSFWR1.R00.X212.B01.1909060036 09/06/2019
[ 264.210160] Call Trace:
[ 264.210167] dump_stack+0x98/0xda
[ 264.210174] spin_dump.cold+0x24/0x3c
[ 264.210178] do_raw_spin_lock+0x9a/0xd0
[ 264.210184] _raw_spin_lock_nested+0x6a/0x70
[ 264.210314] __i915_request_submit+0x10a/0x3c0 [i915]
[ 264.210415] virtual_submit_request+0x9b/0x380 [i915]
[ 264.210516] submit_notify+0xaf/0x14c [i915]
[ 264.210602] __i915_sw_fence_complete+0x8a/0x230 [i915]
[ 264.210692] i915_sw_fence_complete+0x2d/0x40 [i915]
[ 264.210762] __dma_i915_sw_fence_wake+0x19/0x30 [i915]
[ 264.210767] dma_fence_signal_locked+0xb1/0x1c0
[ 264.210772] dma_fence_signal+0x29/0x50
[ 264.210871] i915_request_wait+0x5cb/0x830 [i915]
[ 264.210876] ? dma_resv_get_fences_rcu+0x294/0x5d0
[ 264.210974] i915_gem_object_wait_fence+0x2f/0x40 [i915]
[ 264.211084] i915_gem_object_wait+0xce/0x400 [i915]
[ 264.211178] i915_gem_wait_ioctl+0xff/0x290 [i915]
Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
References: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: "Nayana, Venkata Ramana" <venkata.ramana.nayana(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.4+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200713141636.29326-1-chris@…
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index d907d538176e..91786310c114 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -314,13 +314,18 @@ bool i915_request_enable_breadcrumb(struct i915_request *rq)
{
lockdep_assert_held(&rq->lock);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
+ return true;
+
if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
struct intel_context *ce = rq->context;
struct list_head *pos;
spin_lock(&b->irq_lock);
- GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags));
+
+ if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
+ goto unlock;
if (!__intel_breadcrumbs_arm_irq(b))
goto unlock;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3bb7320249ae..0b2fe55e6194 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -560,22 +560,25 @@ bool __i915_request_submit(struct i915_request *request)
engine->serial++;
result = true;
-xfer: /* We may be recursing from the signal callback of another i915 fence */
- spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-
+xfer:
if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
list_move_tail(&request->sched.link, &engine->active.requests);
clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
- __notify_execute_cb(request);
}
- GEM_BUG_ON(!llist_empty(&request->execute_cb));
- if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
- !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
- !i915_request_enable_breadcrumb(request))
- intel_engine_signal_breadcrumbs(engine);
+ /* We may be recursing from the signal callback of another i915 fence */
+ if (!i915_request_signaled(request)) {
+ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+ __notify_execute_cb(request);
+ if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+ &request->fence.flags) &&
+ !i915_request_enable_breadcrumb(request))
+ intel_engine_signal_breadcrumbs(engine);
- spin_unlock(&request->lock);
+ spin_unlock(&request->lock);
+ GEM_BUG_ON(!llist_empty(&request->execute_cb));
+ }
return result;
}
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1d9221e9d395c8c80d99d002bea3c9b6dd192d6a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 13 Jul 2020 15:16:36 +0100
Subject: [PATCH] drm/i915: Skip signaling a signaled request
Preempt-to-busy introduces various fascinating complications in that the
requests may complete as we are unsubmitting them from HW. As they may
then signal after unsubmission, we may find ourselves having to cleanup
the signaling request from within the signaling callback. This causes us
to recurse onto the same i915_request.lock.
However, if the request is already signaled (as it will be before we
enter the signal callbacks), we know we can skip the signaling of that
request during submission, neatly evading the spinlock recursion.
unsubmit(ve.rq0) # timeslice expiration or other preemption
-> virtual_submit_request(ve.rq0)
dma_fence_signal(ve.rq0) # request completed before preemption ack
-> submit_notify(ve.rq1)
-> virtual_submit_request(ve.rq1) # sees that we have completed ve.rq0
-> __i915_request_submit(ve.rq0)
[ 264.210142] BUG: spinlock recursion on CPU#2, sample_multi_tr/2093
[ 264.210150] lock: 0xffff9efd6ac55080, .magic: dead4ead, .owner: sample_multi_tr/2093, .owner_cpu: 2
[ 264.210155] CPU: 2 PID: 2093 Comm: sample_multi_tr Tainted: G U
[ 264.210158] Hardware name: Intel Corporation CoffeeLake Client Platform/CoffeeLake S UDIMM RVP, BIOS CNLSFWR1.R00.X212.B01.1909060036 09/06/2019
[ 264.210160] Call Trace:
[ 264.210167] dump_stack+0x98/0xda
[ 264.210174] spin_dump.cold+0x24/0x3c
[ 264.210178] do_raw_spin_lock+0x9a/0xd0
[ 264.210184] _raw_spin_lock_nested+0x6a/0x70
[ 264.210314] __i915_request_submit+0x10a/0x3c0 [i915]
[ 264.210415] virtual_submit_request+0x9b/0x380 [i915]
[ 264.210516] submit_notify+0xaf/0x14c [i915]
[ 264.210602] __i915_sw_fence_complete+0x8a/0x230 [i915]
[ 264.210692] i915_sw_fence_complete+0x2d/0x40 [i915]
[ 264.210762] __dma_i915_sw_fence_wake+0x19/0x30 [i915]
[ 264.210767] dma_fence_signal_locked+0xb1/0x1c0
[ 264.210772] dma_fence_signal+0x29/0x50
[ 264.210871] i915_request_wait+0x5cb/0x830 [i915]
[ 264.210876] ? dma_resv_get_fences_rcu+0x294/0x5d0
[ 264.210974] i915_gem_object_wait_fence+0x2f/0x40 [i915]
[ 264.211084] i915_gem_object_wait+0xce/0x400 [i915]
[ 264.211178] i915_gem_wait_ioctl+0xff/0x290 [i915]
Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
References: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: "Nayana, Venkata Ramana" <venkata.ramana.nayana(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.4+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200713141636.29326-1-chris@…
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index d907d538176e..91786310c114 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -314,13 +314,18 @@ bool i915_request_enable_breadcrumb(struct i915_request *rq)
{
lockdep_assert_held(&rq->lock);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
+ return true;
+
if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
struct intel_context *ce = rq->context;
struct list_head *pos;
spin_lock(&b->irq_lock);
- GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags));
+
+ if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
+ goto unlock;
if (!__intel_breadcrumbs_arm_irq(b))
goto unlock;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3bb7320249ae..0b2fe55e6194 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -560,22 +560,25 @@ bool __i915_request_submit(struct i915_request *request)
engine->serial++;
result = true;
-xfer: /* We may be recursing from the signal callback of another i915 fence */
- spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-
+xfer:
if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
list_move_tail(&request->sched.link, &engine->active.requests);
clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
- __notify_execute_cb(request);
}
- GEM_BUG_ON(!llist_empty(&request->execute_cb));
- if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
- !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
- !i915_request_enable_breadcrumb(request))
- intel_engine_signal_breadcrumbs(engine);
+ /* We may be recursing from the signal callback of another i915 fence */
+ if (!i915_request_signaled(request)) {
+ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+ __notify_execute_cb(request);
+ if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+ &request->fence.flags) &&
+ !i915_request_enable_breadcrumb(request))
+ intel_engine_signal_breadcrumbs(engine);
- spin_unlock(&request->lock);
+ spin_unlock(&request->lock);
+ GEM_BUG_ON(!llist_empty(&request->execute_cb));
+ }
return result;
}
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1d9221e9d395c8c80d99d002bea3c9b6dd192d6a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Mon, 13 Jul 2020 15:16:36 +0100
Subject: [PATCH] drm/i915: Skip signaling a signaled request
Preempt-to-busy introduces various fascinating complications in that the
requests may complete as we are unsubmitting them from HW. As they may
then signal after unsubmission, we may find ourselves having to cleanup
the signaling request from within the signaling callback. This causes us
to recurse onto the same i915_request.lock.
However, if the request is already signaled (as it will be before we
enter the signal callbacks), we know we can skip the signaling of that
request during submission, neatly evading the spinlock recursion.
unsubmit(ve.rq0) # timeslice expiration or other preemption
-> virtual_submit_request(ve.rq0)
dma_fence_signal(ve.rq0) # request completed before preemption ack
-> submit_notify(ve.rq1)
-> virtual_submit_request(ve.rq1) # sees that we have completed ve.rq0
-> __i915_request_submit(ve.rq0)
[ 264.210142] BUG: spinlock recursion on CPU#2, sample_multi_tr/2093
[ 264.210150] lock: 0xffff9efd6ac55080, .magic: dead4ead, .owner: sample_multi_tr/2093, .owner_cpu: 2
[ 264.210155] CPU: 2 PID: 2093 Comm: sample_multi_tr Tainted: G U
[ 264.210158] Hardware name: Intel Corporation CoffeeLake Client Platform/CoffeeLake S UDIMM RVP, BIOS CNLSFWR1.R00.X212.B01.1909060036 09/06/2019
[ 264.210160] Call Trace:
[ 264.210167] dump_stack+0x98/0xda
[ 264.210174] spin_dump.cold+0x24/0x3c
[ 264.210178] do_raw_spin_lock+0x9a/0xd0
[ 264.210184] _raw_spin_lock_nested+0x6a/0x70
[ 264.210314] __i915_request_submit+0x10a/0x3c0 [i915]
[ 264.210415] virtual_submit_request+0x9b/0x380 [i915]
[ 264.210516] submit_notify+0xaf/0x14c [i915]
[ 264.210602] __i915_sw_fence_complete+0x8a/0x230 [i915]
[ 264.210692] i915_sw_fence_complete+0x2d/0x40 [i915]
[ 264.210762] __dma_i915_sw_fence_wake+0x19/0x30 [i915]
[ 264.210767] dma_fence_signal_locked+0xb1/0x1c0
[ 264.210772] dma_fence_signal+0x29/0x50
[ 264.210871] i915_request_wait+0x5cb/0x830 [i915]
[ 264.210876] ? dma_resv_get_fences_rcu+0x294/0x5d0
[ 264.210974] i915_gem_object_wait_fence+0x2f/0x40 [i915]
[ 264.211084] i915_gem_object_wait+0xce/0x400 [i915]
[ 264.211178] i915_gem_wait_ioctl+0xff/0x290 [i915]
Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy")
References: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: "Nayana, Venkata Ramana" <venkata.ramana.nayana(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.4+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200713141636.29326-1-chris@…
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index d907d538176e..91786310c114 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -314,13 +314,18 @@ bool i915_request_enable_breadcrumb(struct i915_request *rq)
{
lockdep_assert_held(&rq->lock);
+ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
+ return true;
+
if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
struct intel_context *ce = rq->context;
struct list_head *pos;
spin_lock(&b->irq_lock);
- GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags));
+
+ if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
+ goto unlock;
if (!__intel_breadcrumbs_arm_irq(b))
goto unlock;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3bb7320249ae..0b2fe55e6194 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -560,22 +560,25 @@ bool __i915_request_submit(struct i915_request *request)
engine->serial++;
result = true;
-xfer: /* We may be recursing from the signal callback of another i915 fence */
- spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-
+xfer:
if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) {
list_move_tail(&request->sched.link, &engine->active.requests);
clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
- __notify_execute_cb(request);
}
- GEM_BUG_ON(!llist_empty(&request->execute_cb));
- if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
- !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
- !i915_request_enable_breadcrumb(request))
- intel_engine_signal_breadcrumbs(engine);
+ /* We may be recursing from the signal callback of another i915 fence */
+ if (!i915_request_signaled(request)) {
+ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+ __notify_execute_cb(request);
+ if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+ &request->fence.flags) &&
+ !i915_request_enable_breadcrumb(request))
+ intel_engine_signal_breadcrumbs(engine);
- spin_unlock(&request->lock);
+ spin_unlock(&request->lock);
+ GEM_BUG_ON(!llist_empty(&request->execute_cb));
+ }
return result;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 24ebec25fb270100e252b19c288e21bd7d8cc7f7 Mon Sep 17 00:00:00 2001
From: Will Deacon <will(a)kernel.org>
Date: Fri, 29 May 2020 14:12:18 +0100
Subject: [PATCH] arm64: hw_breakpoint: Don't invoke overflow handler on
uaccess watchpoints
Unprivileged memory accesses generated by the so-called "translated"
instructions (e.g. STTR) at EL1 can cause EL0 watchpoints to fire
unexpectedly if kernel debugging is enabled. In such cases, the
hw_breakpoint logic will invoke the user overflow handler which will
typically raise a SIGTRAP back to the current task. This is futile when
returning back to the kernel because (a) the signal won't have been
delivered and (b) userspace can't handle the thing anyway.
Avoid invoking the user overflow handler for watchpoints triggered by
kernel uaccess routines, and instead single-step over the faulting
instruction as we would if no overflow handler had been installed.
(Fixes tag identifies the introduction of unprivileged memory accesses,
which exposed this latent bug in the hw_breakpoint code)
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: James Morse <james.morse(a)arm.com>
Fixes: 57f4959bad0a ("arm64: kernel: Add support for User Access Override")
Reported-by: Luis Machado <luis.machado(a)linaro.org>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 0b727edf4104..af234a1e08b7 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -730,6 +730,27 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
return 0;
}
+static int watchpoint_report(struct perf_event *wp, unsigned long addr,
+ struct pt_regs *regs)
+{
+ int step = is_default_overflow_handler(wp);
+ struct arch_hw_breakpoint *info = counter_arch_bp(wp);
+
+ info->trigger = addr;
+
+ /*
+ * If we triggered a user watchpoint from a uaccess routine, then
+ * handle the stepping ourselves since userspace really can't help
+ * us with this.
+ */
+ if (!user_mode(regs) && info->ctrl.privilege == AARCH64_BREAKPOINT_EL0)
+ step = 1;
+ else
+ perf_bp_event(wp, regs);
+
+ return step;
+}
+
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
@@ -739,7 +760,6 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
u64 val;
struct perf_event *wp, **slots;
struct debug_info *debug_info;
- struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;
slots = this_cpu_ptr(wp_on_reg);
@@ -777,25 +797,13 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
if (dist != 0)
continue;
- info = counter_arch_bp(wp);
- info->trigger = addr;
- perf_bp_event(wp, regs);
-
- /* Do we need to handle the stepping? */
- if (is_default_overflow_handler(wp))
- step = 1;
+ step = watchpoint_report(wp, addr, regs);
}
- if (min_dist > 0 && min_dist != -1) {
- /* No exact match found. */
- wp = slots[closest_match];
- info = counter_arch_bp(wp);
- info->trigger = addr;
- perf_bp_event(wp, regs);
- /* Do we need to handle the stepping? */
- if (is_default_overflow_handler(wp))
- step = 1;
- }
+ /* No exact match found? */
+ if (min_dist > 0 && min_dist != -1)
+ step = watchpoint_report(slots[closest_match], addr, regs);
+
rcu_read_unlock();
if (!step)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 24ebec25fb270100e252b19c288e21bd7d8cc7f7 Mon Sep 17 00:00:00 2001
From: Will Deacon <will(a)kernel.org>
Date: Fri, 29 May 2020 14:12:18 +0100
Subject: [PATCH] arm64: hw_breakpoint: Don't invoke overflow handler on
uaccess watchpoints
Unprivileged memory accesses generated by the so-called "translated"
instructions (e.g. STTR) at EL1 can cause EL0 watchpoints to fire
unexpectedly if kernel debugging is enabled. In such cases, the
hw_breakpoint logic will invoke the user overflow handler which will
typically raise a SIGTRAP back to the current task. This is futile when
returning back to the kernel because (a) the signal won't have been
delivered and (b) userspace can't handle the thing anyway.
Avoid invoking the user overflow handler for watchpoints triggered by
kernel uaccess routines, and instead single-step over the faulting
instruction as we would if no overflow handler had been installed.
(Fixes tag identifies the introduction of unprivileged memory accesses,
which exposed this latent bug in the hw_breakpoint code)
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: James Morse <james.morse(a)arm.com>
Fixes: 57f4959bad0a ("arm64: kernel: Add support for User Access Override")
Reported-by: Luis Machado <luis.machado(a)linaro.org>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 0b727edf4104..af234a1e08b7 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -730,6 +730,27 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
return 0;
}
+static int watchpoint_report(struct perf_event *wp, unsigned long addr,
+ struct pt_regs *regs)
+{
+ int step = is_default_overflow_handler(wp);
+ struct arch_hw_breakpoint *info = counter_arch_bp(wp);
+
+ info->trigger = addr;
+
+ /*
+ * If we triggered a user watchpoint from a uaccess routine, then
+ * handle the stepping ourselves since userspace really can't help
+ * us with this.
+ */
+ if (!user_mode(regs) && info->ctrl.privilege == AARCH64_BREAKPOINT_EL0)
+ step = 1;
+ else
+ perf_bp_event(wp, regs);
+
+ return step;
+}
+
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
@@ -739,7 +760,6 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
u64 val;
struct perf_event *wp, **slots;
struct debug_info *debug_info;
- struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;
slots = this_cpu_ptr(wp_on_reg);
@@ -777,25 +797,13 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
if (dist != 0)
continue;
- info = counter_arch_bp(wp);
- info->trigger = addr;
- perf_bp_event(wp, regs);
-
- /* Do we need to handle the stepping? */
- if (is_default_overflow_handler(wp))
- step = 1;
+ step = watchpoint_report(wp, addr, regs);
}
- if (min_dist > 0 && min_dist != -1) {
- /* No exact match found. */
- wp = slots[closest_match];
- info = counter_arch_bp(wp);
- info->trigger = addr;
- perf_bp_event(wp, regs);
- /* Do we need to handle the stepping? */
- if (is_default_overflow_handler(wp))
- step = 1;
- }
+ /* No exact match found? */
+ if (min_dist > 0 && min_dist != -1)
+ step = watchpoint_report(slots[closest_match], addr, regs);
+
rcu_read_unlock();
if (!step)
From: Mike Snitzer <snitzer(a)redhat.com>
Discontinue issuing writethrough write IO in series to the origin and
then cache.
Use bio_clone_fast() to create a new origin clone bio that will be
mapped to the origin device and then bio_chain() it to the bio that gets
remapped to the cache device. The origin clone bio does _not_ have a
copy of the per_bio_data -- as such check_if_tick_bio_needed() will not
be called.
The cache bio (parent bio) will not complete until the origin bio has
completed -- this fulfills bio_clone_fast()'s requirements as well as
the requirement to not complete the original IO until the write IO has
completed to both the origin and cache device.
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
(cherry picked from commit 2df3bae9a6543e90042291707b8db0cbfbae9ee9)
Fixes: 4ec34f2196d125ff781170ddc6c3058c08ec5e73 (dm bio record:
save/restore bi_end_io and bi_integrity )
4ec34f21 introduced a mkfs.ext4 hang on a LVM device that has been
modified with lvconvert --cachemode=writethrough.
CC:stable@vger.kernel.org for 4.14.y
Signed-off-by: John Donnelly <john.p.donnelly(a)oracle.com>
Reviewed-by: Somasundaram Krishnasamy <somasundaram.krishnasamy(a)oracle.com>
conflicts:
drivers/md/dm-cache-target.c. - Corrected usage of
writethrough_mode(&cache->feature) that was caught by
compiler, and removed unused static functions : writethrough_endio(),
defer_writethrough_bio(), wake_deferred_writethrough_worker()
that generated warnings.
---
drivers/md/dm-cache-target.c | 92 ++++++++++++++++++--------------------------
1 file changed, 37 insertions(+), 55 deletions(-)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 69cdb29ef6be..2732d1df05fa 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -450,6 +450,7 @@ struct cache {
struct work_struct migration_worker;
struct delayed_work waker;
struct dm_bio_prison_v2 *prison;
+ struct bio_set *bs;
mempool_t *migration_pool;
@@ -537,11 +538,6 @@ static void wake_deferred_bio_worker(struct cache *cache)
queue_work(cache->wq, &cache->deferred_bio_worker);
}
-static void wake_deferred_writethrough_worker(struct cache *cache)
-{
- queue_work(cache->wq, &cache->deferred_writethrough_worker);
-}
-
static void wake_migration_worker(struct cache *cache)
{
if (passthrough_mode(&cache->features))
@@ -868,16 +864,23 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
spin_unlock_irqrestore(&cache->lock, flags);
}
-static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock)
+static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, bool bio_has_pbd)
{
- // FIXME: this is called way too much.
- check_if_tick_bio_needed(cache, bio);
+ if (bio_has_pbd)
+ check_if_tick_bio_needed(cache, bio);
remap_to_origin(cache, bio);
if (bio_data_dir(bio) == WRITE)
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock)
+{
+ // FIXME: check_if_tick_bio_needed() is called way too much through this interface
+ __remap_to_origin_clear_discard(cache, bio, oblock, true);
+}
+
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
@@ -937,57 +940,26 @@ static void issue_op(struct bio *bio, void *context)
accounted_request(cache, bio);
}
-static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
- bio_list_add(&cache->deferred_writethrough_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
-
- wake_deferred_writethrough_worker(cache);
-}
-
-static void writethrough_endio(struct bio *bio)
-{
- struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
-
- dm_unhook_bio(&pb->hook_info, bio);
-
- if (bio->bi_status) {
- bio_endio(bio);
- return;
- }
-
- dm_bio_restore(&pb->bio_details, bio);
- remap_to_cache(pb->cache, bio, pb->cblock);
-
- /*
- * We can't issue this bio directly, since we're in interrupt
- * context. So it gets put on a bio list for processing by the
- * worker thread.
- */
- defer_writethrough_bio(pb->cache, bio);
-}
-
/*
- * FIXME: send in parallel, huge latency as is.
* When running in writethrough mode we need to send writes to clean blocks
- * to both the cache and origin devices. In future we'd like to clone the
- * bio and send them in parallel, but for now we're doing them in
- * series as this is easier.
+ * to both the cache and origin devices. Clone the bio and send them in parallel.
*/
-static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock, dm_cblock_t cblock)
+static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
{
- struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+ struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+
+ BUG_ON(!origin_bio);
- pb->cache = cache;
- pb->cblock = cblock;
- dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
- dm_bio_record(&pb->bio_details, bio);
+ bio_chain(origin_bio, bio);
+ /*
+ * Passing false to __remap_to_origin_clear_discard() skips
+ * all code that might use per_bio_data (since clone doesn't have it)
+ */
+ __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
+ submit_bio(origin_bio);
- remap_to_origin_clear_discard(pb->cache, bio, oblock);
+ remap_to_cache(cache, bio, cblock);
}
/*----------------------------------------------------------------
@@ -1873,7 +1845,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
} else {
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
!is_dirty(cache, cblock)) {
- remap_to_origin_then_cache(cache, bio, block, cblock);
+ remap_to_origin_and_cache(cache, bio, block, cblock);
accounted_begin(cache, bio);
} else
remap_to_cache_dirty(cache, bio, block, cblock);
@@ -2132,6 +2104,9 @@ static void destroy(struct cache *cache)
kfree(cache->ctr_args[i]);
kfree(cache->ctr_args);
+ if (cache->bs)
+ bioset_free(cache->bs);
+
kfree(cache);
}
@@ -2589,6 +2564,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->features = ca->features;
ti->per_io_data_size = get_per_bio_data_size(cache);
+ if (writethrough_mode(&cache->features)) {
+ /* Create bioset for writethrough bios issued to origin */
+ cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!cache->bs)
+ goto bad;
+ }
+
cache->callbacks.congested_fn = cache_is_congested;
dm_table_add_target_callbacks(ti->table, &cache->callbacks);
--
1.8.3.1
Hello, stable-kernel maintainers!
Could you please cherry-pick these commits into the v5.7.x kernel?
commit 0de6db30ef79b391cedd749801a49c485d2daf4b
Author: Sowjanya Komatineni <skomatineni(a)nvidia.com>
Date: Mon Jan 13 23:24:17 2020 -0800
ASoC: tegra: Use device managed resource APIs to get the clock
commit 1e4e0bf136aa4b4aa59c1e6af19844bd6d807794
Author: Sowjanya Komatineni <skomatineni(a)nvidia.com>
Date: Mon Jan 13 23:24:23 2020 -0800
ASoC: tegra: Add audio mclk parent configuration
commit ff5d18cb04f4ecccbcf05b7f83ab6df2a0d95c16
Author: Sowjanya Komatineni <skomatineni(a)nvidia.com>
Date: Mon Jan 13 23:24:24 2020 -0800
ASoC: tegra: Enable audio mclk during tegra_asoc_utils_init()
It will fix a huge warnings splat during of kernel boot on NVIDIA Tegra
SoCs. For some reason these patches haven't made into 5.7 when it was
released and several people complained about the warnings. Thanks in
advance!
Recently a customer of ours experienced a crash when booting the
system while enabling memory-hotplug.
The problem is that Normal zones on different nodes don't get their private
zone->pageset allocated, and keep sharing the initial boot_pageset.
The sharing between zones is normally safe as explained by the comment for
boot_pageset - it's a percpu structure, and manipulations are done with
disabled interrupts, and boot_pageset is set up in a way that any page placed
on its pcplist is immediately flushed to shared zone's freelist, because
pcp->high == 1.
However, the hotplug operation updates pcp->high to a higher value as it
expects to be operating on a private pageset.
The problem is in build_all_zonelists(), which is called when the first range
of pages is onlined for the Normal zone of node X or Y:
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
setup_zone_pageset(zone);
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
When called during hotplug, it should execute the setup_zone_pageset(zone)
which allocates the private pageset.
However, with memhp_default_state=online, this happens early while
system_state == SYSTEM_BOOTING is still true, hence this step is skipped.
(and build_all_zonelists_init() is probably unsafe anyway at this point).
Another hotplug operation on the same zone then leads to zone_pcp_update(zone)
called from online_pages(), which updates the pcp->high for the shared
boot_pageset to a value higher than 1.
At that point, pages freed from Node X and Y Normal zones can end up on the same
pcplist and from there they can be freed to the wrong zone's freelist,
leading to the corruption and crashes.
Please, note that upstream has fixed that differently (and unintentionally) by
adding another boot state (SYSTEM_SCHEDULING), which is set before smp_init().
That should happen before memory hotplug events even with memhp_default_state=online.
Backporting that would be too intrusive.
Signed-off-by: Oscar Salvador <osalvador(a)suse.de>
Debugged-by: Vlastimil Babka <vbabka(a)suse.cz>
---
include/linux/mmzone.h | 3 ++-
init/main.c | 2 +-
mm/memory_hotplug.c | 10 +++++-----
mm/page_alloc.c | 7 ++++---
4 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e3d7754f25f0..5c7645e156a5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -756,7 +756,8 @@ static inline bool is_dev_zone(const struct zone *zone)
#include <linux/memory_hotplug.h>
extern struct mutex zonelists_mutex;
-void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
+void build_all_zonelists(pg_data_t *pgdat, struct zone *zone,
+ bool hotplug_context);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int classzone_idx, unsigned int alloc_flags,
diff --git a/init/main.c b/init/main.c
index d47860dbe896..7ad08957dd18 100644
--- a/init/main.c
+++ b/init/main.c
@@ -512,7 +512,7 @@ asmlinkage __visible void __init start_kernel(void)
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, NULL, false);
page_alloc_init();
pr_notice("Kernel command line: %s\n", boot_command_line);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 449999657c0b..a4ffe5996317 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1125,7 +1125,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
mutex_lock(&zonelists_mutex);
if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
- build_all_zonelists(NULL, zone);
+ build_all_zonelists(NULL, zone, true);
}
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -1146,7 +1146,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, NULL, true);
else
zone_pcp_update(zone);
}
@@ -1220,7 +1220,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
- build_all_zonelists(pgdat, NULL);
+ build_all_zonelists(pgdat, NULL, true);
mutex_unlock(&zonelists_mutex);
/*
@@ -1276,7 +1276,7 @@ int try_online_node(int nid)
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, NULL, true);
mutex_unlock(&zonelists_mutex);
}
@@ -2016,7 +2016,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, NULL, true);
mutex_unlock(&zonelists_mutex);
} else
zone_pcp_update(zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de00e0fec484..f394dd87fa03 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4608,7 +4608,7 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, NULL, false);
mutex_unlock(&zonelists_mutex);
}
}
@@ -4988,11 +4988,12 @@ build_all_zonelists_init(void)
* (2) call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone,
+ bool hotplug_context)
{
set_zonelist_order();
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state == SYSTEM_BOOTING && !hotplug_context) {
build_all_zonelists_init();
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
--
2.26.2
From: Thomas Gleixner <tglx(a)linutronix.de>
commit baedb87d1b53532f81b4bd0387f83b05d4f7eb9a upstream.
Setting interrupt affinity on inactive interrupts is inconsistent when
hierarchical irq domains are enabled. The core code should just store the
affinity and not call into the irq chip driver for inactive interrupts
because the chip drivers may not be in a state to handle such requests.
X86 has a hacky workaround for that but all other irq chips have not which
causes problems e.g. on GIC V3 ITS.
Instead of adding more ugly hacks all over the place, solve the problem in
the core code. If the affinity is set on an inactive interrupt then:
- Store it in the irq descriptors affinity mask
- Update the effective affinity to reflect that so user space has
a consistent view
- Don't call into the irq chip driver
This is the core equivalent of the X86 workaround and works correctly
because the affinity setting is established in the irq chip when the
interrupt is activated later on.
Note, that this is only effective when hierarchical irq domains are enabled
by the architecture. Doing it unconditionally would break legacy irq chip
implementations.
For hierarchial irq domains this works correctly as none of the drivers can
have a dependency on affinity setting in inactive state by design.
Remove the X86 workaround as it is not longer required.
Fixes: 02edee152d6e ("x86/apic/vector: Ignore set_affinity call for inactive interrupts")
Reported-by: Ali Saidi <alisaidi(a)amazon.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ali Saidi <alisaidi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20200529015501.15771-1-alisaidi@amazon.com
Link: https://lkml.kernel.org/r/877dv2rv25.fsf@nanos.tec.linutronix.de
[fllinden(a)amazon.com - 4.14 never had the x86 workaround, so skip x86 changes]
Signed-off-by: Frank van der Linden <fllinden(a)amazon.com>
---
kernel/irq/manage.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5277949e82e0..ce6fba446814 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,9 +168,9 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
static void irq_validate_effective_affinity(struct irq_data *data)
{
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -178,9 +178,19 @@ static void irq_validate_effective_affinity(struct irq_data *data)
return;
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
-#endif
}
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask)
+{
+ cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
+}
+#else
+static inline void irq_validate_effective_affinity(struct irq_data *data) { }
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask) { }
+#endif
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -205,6 +215,26 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
+static bool irq_set_affinity_deactivated(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ /*
+ * If the interrupt is not yet activated, just store the affinity
+ * mask and do not call the chip driver at all. On activation the
+ * driver has to make sure anyway that the interrupt is in a
+ * useable state so startup works.
+ */
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
+ return false;
+
+ cpumask_copy(desc->irq_common_data.affinity, mask);
+ irq_init_effective_affinity(data, mask);
+ irqd_set(data, IRQD_AFFINITY_SET);
+ return true;
+}
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -215,6 +245,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ if (irq_set_affinity_deactivated(data, mask, force))
+ return 0;
+
if (irq_can_move_pcntxt(data)) {
ret = irq_do_set_affinity(data, mask, force);
} else {
--
2.17.2
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 18e77600f7a1ed69f8ce46c9e11cad0985712dfa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:22 -0700
Subject: [PATCH] khugepaged: retract_page_tables() remember to test exit
Only once have I seen this scenario (and forgot even to notice what forced
the eventual crash): a sequence of "BUG: Bad page map" alerts from
vm_normal_page(), from zap_pte_range() servicing exit_mmap();
pmd:00000000, pte values corresponding to data in physical page 0.
The pte mappings being zapped in this case were supposed to be from a huge
page of ext4 text (but could as well have been shmem): my belief is that
it was racing with collapse_file()'s retract_page_tables(), found *pmd
pointing to a page table, locked it, but *pmd had become 0 by the time
start_pte was decided.
In most cases, that possibility is excluded by holding mmap lock; but
exit_mmap() proceeds without mmap lock. Most of what's run by khugepaged
checks khugepaged_test_exit() after acquiring mmap lock:
khugepaged_collapse_pte_mapped_thps() and hugepage_vma_revalidate() do so,
for example. But retract_page_tables() did not: fix that.
The fix is for retract_page_tables() to check khugepaged_test_exit(),
after acquiring mmap lock, before doing anything to the page table.
Getting the mmap lock serializes with __mmput(), which briefly takes and
drops it in __khugepaged_exit(); then the khugepaged_test_exit() check on
mm_users makes sure we don't touch the page table once exit_mmap() might
reach it, since exit_mmap() will be proceeding without mmap lock, not
expecting anyone to be racing with it.
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021215400.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a9aca9b71d6f..ac04b332a373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,6 +1532,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1560,7 +1561,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1570,17 +1572,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
For support of long running hypercalls xen_maybe_preempt_hcall() is
calling cond_resched() in case a hypercall marked as preemptible has
been interrupted.
Normally this is no problem, as only hypercalls done via some ioctl()s
are marked to be preemptible. In rare cases when during such a
preemptible hypercall an interrupt occurs and any softirq action is
started from irq_exit(), a further hypercall issued by the softirq
handler will be regarded to be preemptible, too. This might lead to
rescheduling in spite of the softirq handler potentially having set
preempt_disable(), leading to splats like:
BUG: sleeping function called from invalid context at drivers/xen/preempt.c:37
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 20775, name: xl
INFO: lockdep is turned off.
CPU: 1 PID: 20775 Comm: xl Tainted: G D W 5.4.46-1_prgmr_debug.el7.x86_64 #1
Call Trace:
<IRQ>
dump_stack+0x8f/0xd0
___might_sleep.cold.76+0xb2/0x103
xen_maybe_preempt_hcall+0x48/0x70
xen_do_hypervisor_callback+0x37/0x40
RIP: e030:xen_hypercall_xen_version+0xa/0x20
Code: ...
RSP: e02b:ffffc900400dcc30 EFLAGS: 00000246
RAX: 000000000004000d RBX: 0000000000000200 RCX: ffffffff8100122a
RDX: ffff88812e788000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffffffff83ee3ad0 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000246 R12: ffff8881824aa0b0
R13: 0000000865496000 R14: 0000000865496000 R15: ffff88815d040000
? xen_hypercall_xen_version+0xa/0x20
? xen_force_evtchn_callback+0x9/0x10
? check_events+0x12/0x20
? xen_restore_fl_direct+0x1f/0x20
? _raw_spin_unlock_irqrestore+0x53/0x60
? debug_dma_sync_single_for_cpu+0x91/0xc0
? _raw_spin_unlock_irqrestore+0x53/0x60
? xen_swiotlb_sync_single_for_cpu+0x3d/0x140
? mlx4_en_process_rx_cq+0x6b6/0x1110 [mlx4_en]
? mlx4_en_poll_rx_cq+0x64/0x100 [mlx4_en]
? net_rx_action+0x151/0x4a0
? __do_softirq+0xed/0x55b
? irq_exit+0xea/0x100
? xen_evtchn_do_upcall+0x2c/0x40
? xen_do_hypervisor_callback+0x29/0x40
</IRQ>
? xen_hypercall_domctl+0xa/0x20
? xen_hypercall_domctl+0x8/0x20
? privcmd_ioctl+0x221/0x990 [xen_privcmd]
? do_vfs_ioctl+0xa5/0x6f0
? ksys_ioctl+0x60/0x90
? trace_hardirqs_off_thunk+0x1a/0x20
? __x64_sys_ioctl+0x16/0x20
? do_syscall_64+0x62/0x250
? entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fix that by testing preempt_count() before calling cond_resched().
In kernel 5.8 this can't happen any more due to the entry code rework.
Reported-by: Sarah Newman <srn(a)prgmr.com>
Fixes: 0fa2f5cb2b0ecd8 ("sched/preempt, xen: Use need_resched() instead of should_resched()")
Cc: Sarah Newman <srn(a)prgmr.com>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
---
drivers/xen/preempt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index 17240c5325a3..6ad87b5c95ed 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c
@@ -27,7 +27,7 @@ EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
asmlinkage __visible void xen_maybe_preempt_hcall(void)
{
if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
- && need_resched())) {
+ && need_resched() && !preempt_count())) {
/*
* Clear flag as we may be rescheduled on a different
* cpu.
--
2.26.2
LLVM implemented a recent "libcall optimization" that lowers calls to
`sprintf(dest, "%s", str)` where the return value is used to
`stpcpy(dest, str) - dest`. This generally avoids the machinery involved
in parsing format strings. This optimization was introduced into
clang-12. Because the kernel does not provide an implementation of
stpcpy, we observe linkage failures for almost all targets when building
with ToT clang.
The interface is unsafe as it does not perform any bounds checking.
Disable this "libcall optimization" via `-fno-builtin-stpcpy`.
Cc: stable(a)vger.kernel.org # 4.4
Link: https://bugs.llvm.org/show_bug.cgi?id=47162
Link: https://github.com/ClangBuiltLinux/linux/issues/1126
Link: https://reviews.llvm.org/D85963
Reported-by: Sami Tolvanen <samitolvanen(a)google.com>
Suggested-by: Dávid Bolvanský <david.bolvansky(a)gmail.com>
Suggested-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Signed-off-by: Nick Desaulniers <ndesaulniers(a)google.com>
---
Makefile | 1 +
1 file changed, 1 insertion(+)
diff --git a/Makefile b/Makefile
index 9cac6fde3479..e523dc8d30e0 100644
--- a/Makefile
+++ b/Makefile
@@ -578,6 +578,7 @@ ifneq ($(LLVM_IAS),1)
CLANG_FLAGS += -no-integrated-as
endif
CLANG_FLAGS += -Werror=unknown-warning-option
+CLANG_FLAGS += -fno-builtin-stpcpy
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_AFLAGS += $(CLANG_FLAGS)
export CLANG_FLAGS
--
2.28.0.297.g1956fa8f8d-goog
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 18e77600f7a1ed69f8ce46c9e11cad0985712dfa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:22 -0700
Subject: [PATCH] khugepaged: retract_page_tables() remember to test exit
Only once have I seen this scenario (and forgot even to notice what forced
the eventual crash): a sequence of "BUG: Bad page map" alerts from
vm_normal_page(), from zap_pte_range() servicing exit_mmap();
pmd:00000000, pte values corresponding to data in physical page 0.
The pte mappings being zapped in this case were supposed to be from a huge
page of ext4 text (but could as well have been shmem): my belief is that
it was racing with collapse_file()'s retract_page_tables(), found *pmd
pointing to a page table, locked it, but *pmd had become 0 by the time
start_pte was decided.
In most cases, that possibility is excluded by holding mmap lock; but
exit_mmap() proceeds without mmap lock. Most of what's run by khugepaged
checks khugepaged_test_exit() after acquiring mmap lock:
khugepaged_collapse_pte_mapped_thps() and hugepage_vma_revalidate() do so,
for example. But retract_page_tables() did not: fix that.
The fix is for retract_page_tables() to check khugepaged_test_exit(),
after acquiring mmap lock, before doing anything to the page table.
Getting the mmap lock serializes with __mmput(), which briefly takes and
drops it in __khugepaged_exit(); then the khugepaged_test_exit() check on
mm_users makes sure we don't touch the page table once exit_mmap() might
reach it, since exit_mmap() will be proceeding without mmap lock, not
expecting anyone to be racing with it.
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021215400.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a9aca9b71d6f..ac04b332a373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,6 +1532,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1560,7 +1561,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1570,17 +1572,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 18e77600f7a1ed69f8ce46c9e11cad0985712dfa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:22 -0700
Subject: [PATCH] khugepaged: retract_page_tables() remember to test exit
Only once have I seen this scenario (and forgot even to notice what forced
the eventual crash): a sequence of "BUG: Bad page map" alerts from
vm_normal_page(), from zap_pte_range() servicing exit_mmap();
pmd:00000000, pte values corresponding to data in physical page 0.
The pte mappings being zapped in this case were supposed to be from a huge
page of ext4 text (but could as well have been shmem): my belief is that
it was racing with collapse_file()'s retract_page_tables(), found *pmd
pointing to a page table, locked it, but *pmd had become 0 by the time
start_pte was decided.
In most cases, that possibility is excluded by holding mmap lock; but
exit_mmap() proceeds without mmap lock. Most of what's run by khugepaged
checks khugepaged_test_exit() after acquiring mmap lock:
khugepaged_collapse_pte_mapped_thps() and hugepage_vma_revalidate() do so,
for example. But retract_page_tables() did not: fix that.
The fix is for retract_page_tables() to check khugepaged_test_exit(),
after acquiring mmap lock, before doing anything to the page table.
Getting the mmap lock serializes with __mmput(), which briefly takes and
drops it in __khugepaged_exit(); then the khugepaged_test_exit() check on
mm_users makes sure we don't touch the page table once exit_mmap() might
reach it, since exit_mmap() will be proceeding without mmap lock, not
expecting anyone to be racing with it.
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021215400.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a9aca9b71d6f..ac04b332a373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,6 +1532,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1560,7 +1561,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1570,17 +1572,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 18e77600f7a1ed69f8ce46c9e11cad0985712dfa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:22 -0700
Subject: [PATCH] khugepaged: retract_page_tables() remember to test exit
Only once have I seen this scenario (and forgot even to notice what forced
the eventual crash): a sequence of "BUG: Bad page map" alerts from
vm_normal_page(), from zap_pte_range() servicing exit_mmap();
pmd:00000000, pte values corresponding to data in physical page 0.
The pte mappings being zapped in this case were supposed to be from a huge
page of ext4 text (but could as well have been shmem): my belief is that
it was racing with collapse_file()'s retract_page_tables(), found *pmd
pointing to a page table, locked it, but *pmd had become 0 by the time
start_pte was decided.
In most cases, that possibility is excluded by holding mmap lock; but
exit_mmap() proceeds without mmap lock. Most of what's run by khugepaged
checks khugepaged_test_exit() after acquiring mmap lock:
khugepaged_collapse_pte_mapped_thps() and hugepage_vma_revalidate() do so,
for example. But retract_page_tables() did not: fix that.
The fix is for retract_page_tables() to check khugepaged_test_exit(),
after acquiring mmap lock, before doing anything to the page table.
Getting the mmap lock serializes with __mmput(), which briefly takes and
drops it in __khugepaged_exit(); then the khugepaged_test_exit() check on
mm_users makes sure we don't touch the page table once exit_mmap() might
reach it, since exit_mmap() will be proceeding without mmap lock, not
expecting anyone to be racing with it.
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021215400.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a9aca9b71d6f..ac04b332a373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,6 +1532,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1560,7 +1561,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1570,17 +1572,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 18e77600f7a1ed69f8ce46c9e11cad0985712dfa Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:22 -0700
Subject: [PATCH] khugepaged: retract_page_tables() remember to test exit
Only once have I seen this scenario (and forgot even to notice what forced
the eventual crash): a sequence of "BUG: Bad page map" alerts from
vm_normal_page(), from zap_pte_range() servicing exit_mmap();
pmd:00000000, pte values corresponding to data in physical page 0.
The pte mappings being zapped in this case were supposed to be from a huge
page of ext4 text (but could as well have been shmem): my belief is that
it was racing with collapse_file()'s retract_page_tables(), found *pmd
pointing to a page table, locked it, but *pmd had become 0 by the time
start_pte was decided.
In most cases, that possibility is excluded by holding mmap lock; but
exit_mmap() proceeds without mmap lock. Most of what's run by khugepaged
checks khugepaged_test_exit() after acquiring mmap lock:
khugepaged_collapse_pte_mapped_thps() and hugepage_vma_revalidate() do so,
for example. But retract_page_tables() did not: fix that.
The fix is for retract_page_tables() to check khugepaged_test_exit(),
after acquiring mmap lock, before doing anything to the page table.
Getting the mmap lock serializes with __mmput(), which briefly takes and
drops it in __khugepaged_exit(); then the khugepaged_test_exit() check on
mm_users makes sure we don't touch the page table once exit_mmap() might
reach it, since exit_mmap() will be proceeding without mmap lock, not
expecting anyone to be racing with it.
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021215400.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a9aca9b71d6f..ac04b332a373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,6 +1532,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1560,7 +1561,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1570,17 +1572,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
If the value of the link register is not correct (tail call from asm
that didn't set it, stack corruption, memory no longer mapped), then
using it for an address calculation may trigger an exception. Without a
fixup handler, this will lead to a panic, which will unwind, which will
trigger the fault repeatedly in an infinite loop.
We don't observe such failures currently, but we have. Just to be safe,
add a fixup handler here so that at least we don't have an infinite
loop.
Cc: stable(a)vger.kernel.org
Fixes: commit 6dc5fd93b2f1 ("ARM: 8900/1: UNWINDER_FRAME_POINTER implementation for Clang")
Reported-by: Miles Chen <miles.chen(a)mediatek.com>
Signed-off-by: Nick Desaulniers <ndesaulniers(a)google.com>
---
arch/arm/lib/backtrace-clang.S | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S
index 5388ac664c12..40eb2215eaf4 100644
--- a/arch/arm/lib/backtrace-clang.S
+++ b/arch/arm/lib/backtrace-clang.S
@@ -146,7 +146,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions
tst sv_lr, #0 @ If there's no previous lr,
beq finished_setup @ we're done.
- ldr r0, [sv_lr, #-4] @ get call instruction
+prev_call: ldr r0, [sv_lr, #-4] @ get call instruction
ldr r3, .Lopcode+4
and r2, r3, r0 @ is this a bl call
teq r2, r3
@@ -206,6 +206,13 @@ finished_setup:
mov r2, frame
bl printk
no_frame: ldmfd sp!, {r4 - r9, fp, pc}
+/*
+ * Accessing the address pointed to by the link register triggered an
+ * exception, don't try to unwind through it.
+ */
+bad_lr: mov sv_fp, #0
+ mov sv_lr, #0
+ b finished_setup
ENDPROC(c_backtrace)
.pushsection __ex_table,"a"
.align 3
@@ -214,6 +221,7 @@ ENDPROC(c_backtrace)
.long 1003b, 1006b
.long 1004b, 1006b
.long 1005b, 1006b
+ .long prev_call, bad_lr
.popsection
.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n"
--
2.28.0.163.g6104cc2f0b6-goog
An earlier commit:
b7db41c9e03b ("io_uring: fix regression with always ignoring signals in io_cqring_wait()")
ensured that we didn't get stuck waiting for eventfd reads when it's
registered with the io_uring ring for event notification, but we still
have a gap where the task can be waiting on other events in the kernel
and need a bigger nudge to make forward progress.
Ensure that we use signaled notifications for a task that isn't currently
running, to be certain the work is seen and processed immediately.
Cc: stable(a)vger.kernel.org # v5.7+
Reported-by: Josef <josef.grieb(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
fs/io_uring.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e9b27cdaa735..443eecdfeda9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1712,21 +1712,27 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
struct io_ring_ctx *ctx = req->ctx;
int ret, notify = TWA_RESUME;
+ ret = __task_work_add(tsk, cb);
+ if (unlikely(ret))
+ return ret;
+
/*
* SQPOLL kernel thread doesn't need notification, just a wakeup.
- * If we're not using an eventfd, then TWA_RESUME is always fine,
- * as we won't have dependencies between request completions for
- * other kernel wait conditions.
+ * For any other work, use signaled wakeups if the task isn't
+ * running to avoid dependencies between tasks or threads. If
+ * the issuing task is currently waiting in the kernel on a thread,
+ * and same thread is waiting for a completion event, then we need
+ * to ensure that the issuing task processes task_work. TWA_SIGNAL
+ * is needed for that.
*/
if (ctx->flags & IORING_SETUP_SQPOLL)
notify = 0;
- else if (ctx->cq_ev_fd)
+ else if (READ_ONCE(tsk->state) != TASK_RUNNING)
notify = TWA_SIGNAL;
- ret = task_work_add(tsk, cb, notify);
- if (!ret)
- wake_up_process(tsk);
- return ret;
+ __task_work_notify(tsk, notify);
+ wake_up_process(tsk);
+ return 0;
}
static void __io_req_task_cancel(struct io_kiocb *req, int error)
--
2.28.0
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: b189817cf789 ("crypto: caam/qi - add ablkcipher and authenc algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58, v4.19.139, v4.14.193.
v5.8.1: Failed to apply! Possible dependencies:
297142490236 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
297142490236 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
297142490236 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
v4.19.139: Failed to apply! Possible dependencies:
0a5dff9882e5 ("crypto: arm/ghash - provide a synchronous version")
1ca1b917940c ("crypto: chacha20-generic - refactor to allow varying number of rounds")
297142490236 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
8a5a79d5556b ("crypto: x86/chacha20 - Add a 4-block AVX2 variant")
99680c5e9182 ("crypto: arm - convert to use crypto_simd_usable()")
9b17608f15b9 ("crypto: x86/chacha20 - Use larger block functions more aggressively")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
a5dd97f86211 ("crypto: x86/chacha20 - Add a 2-block AVX2 variant")
aec48adce85d ("crypto: caam/qi - remove ablkcipher IV generation")
c3b734dd325d ("crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
db8e15a24957 ("crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant")
e4e72063d3c0 ("crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant")
v4.14.193: Failed to apply! Possible dependencies:
297142490236 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
662f70ede597 ("crypto: caam - remove needless ablkcipher key copy")
7e0880b9fbbe ("crypto: caam - add Derived Key Protocol (DKP) support")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: b189817cf789 ("crypto: caam/qi - add ablkcipher and authenc algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58, v4.19.139, v4.14.193.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
6359e4e1bca8 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
6359e4e1bca8 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
6359e4e1bca8 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
v4.19.139: Failed to apply! Possible dependencies:
0a5dff9882e5 ("crypto: arm/ghash - provide a synchronous version")
1ca1b917940c ("crypto: chacha20-generic - refactor to allow varying number of rounds")
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
6359e4e1bca8 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
8a5a79d5556b ("crypto: x86/chacha20 - Add a 4-block AVX2 variant")
99680c5e9182 ("crypto: arm - convert to use crypto_simd_usable()")
9b17608f15b9 ("crypto: x86/chacha20 - Use larger block functions more aggressively")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
a5dd97f86211 ("crypto: x86/chacha20 - Add a 2-block AVX2 variant")
aec48adce85d ("crypto: caam/qi - remove ablkcipher IV generation")
c3b734dd325d ("crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
db8e15a24957 ("crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant")
e4e72063d3c0 ("crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant")
v4.14.193: Failed to apply! Possible dependencies:
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
6359e4e1bca8 ("crypto: caam/qi - add fallback for XTS with more than 8B IV")
662f70ede597 ("crypto: caam - remove needless ablkcipher key copy")
7e0880b9fbbe ("crypto: caam - add Derived Key Protocol (DKP) support")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: 226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
f9665aa69c99 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
f9665aa69c99 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
f9665aa69c99 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: 226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: b189817cf789 ("crypto: caam/qi - add ablkcipher and authenc algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58, v4.19.139, v4.14.193.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
v4.19.139: Failed to apply! Possible dependencies:
0a5dff9882e5 ("crypto: arm/ghash - provide a synchronous version")
1ca1b917940c ("crypto: chacha20-generic - refactor to allow varying number of rounds")
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
8a5a79d5556b ("crypto: x86/chacha20 - Add a 4-block AVX2 variant")
99680c5e9182 ("crypto: arm - convert to use crypto_simd_usable()")
9b17608f15b9 ("crypto: x86/chacha20 - Use larger block functions more aggressively")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
a5dd97f86211 ("crypto: x86/chacha20 - Add a 2-block AVX2 variant")
aec48adce85d ("crypto: caam/qi - remove ablkcipher IV generation")
c3b734dd325d ("crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
db8e15a24957 ("crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant")
e4e72063d3c0 ("crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant")
v4.14.193: Failed to apply! Possible dependencies:
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
662f70ede597 ("crypto: caam - remove needless ablkcipher key copy")
7e0880b9fbbe ("crypto: caam - add Derived Key Protocol (DKP) support")
87ec3a0b1c2d ("crypto: caam - prepare for gcm(aes) support over QI interface")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: 226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
93f83eb04603 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
93f83eb04603 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
93f83eb04603 ("crypto: caam/qi2 - add fallback for XTS with more than 8B IV")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: 226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Hi
[This is an automated email]
This commit has been processed because it contains a "Fixes:" tag
fixing commit: b189817cf789 ("crypto: caam/qi - add ablkcipher and authenc algorithms").
The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58, v4.19.139, v4.14.193.
v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
v5.4.58: Failed to apply! Possible dependencies:
64db5e7439fb ("crypto: sparc/aes - convert to skcipher API")
66d7fb94e4ff ("crypto: blake2s - generic C library implementation and selftest")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
746b2e024c67 ("crypto: lib - tidy up lib/crypto Kconfig and Makefile")
7988fb2c03c8 ("crypto: s390/aes - convert to skcipher API")
7f725f41f627 ("crypto: powerpc - convert SPE AES algorithms to skcipher API")
7f9b0880925f ("crypto: blake2s - implement generic shash driver")
91d689337fe8 ("crypto: blake2b - add blake2b generic implementation")
b4d0c0aad57a ("crypto: arm - use Kconfig based compiler checks for crypto opcodes")
b95bba5d0114 ("crypto: skcipher - rename the crypto_blkcipher module and kconfig option")
d00c06398154 ("crypto: s390/paes - convert to skcipher API")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
v4.19.139: Failed to apply! Possible dependencies:
0a5dff9882e5 ("crypto: arm/ghash - provide a synchronous version")
1ca1b917940c ("crypto: chacha20-generic - refactor to allow varying number of rounds")
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
674f368a952c ("crypto: remove CRYPTO_TFM_RES_BAD_KEY_LEN")
8a5a79d5556b ("crypto: x86/chacha20 - Add a 4-block AVX2 variant")
99680c5e9182 ("crypto: arm - convert to use crypto_simd_usable()")
9b17608f15b9 ("crypto: x86/chacha20 - Use larger block functions more aggressively")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
a5dd97f86211 ("crypto: x86/chacha20 - Add a 2-block AVX2 variant")
aec48adce85d ("crypto: caam/qi - remove ablkcipher IV generation")
c3b734dd325d ("crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")
db8e15a24957 ("crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant")
e4e72063d3c0 ("crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant")
v4.14.193: Failed to apply! Possible dependencies:
5ca7badb1f62 ("crypto: caam/jr - ablkcipher -> skcipher conversion")
662f70ede597 ("crypto: caam - remove needless ablkcipher key copy")
7e0880b9fbbe ("crypto: caam - add Derived Key Protocol (DKP) support")
87ec3a0b1c2d ("crypto: caam - prepare for gcm(aes) support over QI interface")
9dbe3072c6b1 ("crypto: caam/qi - ablkcipher -> skcipher conversion")
cf5448b5c3d8 ("crypto: caam/jr - remove ablkcipher IV generation")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
Checks for partial overlaps on insertion assume that end elements
are always descendant nodes of their corresponding start, because
they are inserted later. However, this is not the case if a
previous delete operation caused a tree rotation as part of
rebalancing.
Taking the issue reported by Andreas Fischer as an example, if we
omit delete operations, the existing procedure works because,
equivalently, we are inserting a start item with value 40 in the
this region of the red-black tree with single-sized intervals:
overlap flag
10 (start)
/ \ false
20 (start)
/ \ false
30 (start)
/ \ false
60 (start)
/ \ false
50 (end)
/ \ false
20 (end)
/ \ false
40 (start)
if we now delete interval 30 - 30, the tree can be rearranged in
a way similar to this (note the rotation involving 50 - 50):
overlap flag
10 (start)
/ \ false
20 (start)
/ \ false
25 (start)
/ \ false
70 (start)
/ \ false
50 (end)
/ \ true (from rule a1.)
50 (start)
/ \ true
40 (start)
and we traverse interval 50 - 50 from the opposite direction
compared to what was expected.
To deal with those cases, add a start-before-start rule, b4.,
that covers traversal of existing intervals from the right.
We now need to restrict start-after-end rule b3. to cases
where there are no occurring nodes between existing start and
end elements, because addition of rule b4. isn't sufficient to
ensure that the pre-existing end element we encounter while
descending the tree corresponds to a start element of an
interval that we already traversed entirely.
Different types of overlap detection on trees with rotations
resulting from re-balancing will be covered by nft test case
sets/0044interval_overlap_1.
Reported-by: Andreas Fischer <netfilter(a)d9c.eu>
Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1449
Cc: <stable(a)vger.kernel.org> # 5.6.x
Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion")
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com>
---
net/netfilter/nft_set_rbtree.c | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 4b2834fd17b2..27668f4e44ea 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -238,21 +238,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
*
* b1. _ _ __>| !_ _ __| (insert end before existing start)
* b2. _ _ ___| !_ _ _>| (insert end after existing start)
- * b3. _ _ ___! >|_ _ __| (insert start after existing end)
+ * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf)
+ * '--' no nodes falling in this range
+ * b4. >|_ _ ! (insert start before existing start)
*
* Case a3. resolves to b3.:
* - if the inserted start element is the leftmost, because the '0'
* element in the tree serves as end element
- * - otherwise, if an existing end is found. Note that end elements are
- * always inserted after corresponding start elements.
+ * - otherwise, if an existing end is found immediately to the left. If
+ * there are existing nodes in between, we need to further descend the
+ * tree before we can conclude the new start isn't causing an overlap
+ *
+ * or to b4., which, preceded by a3., means we already traversed one or
+ * more existing intervals entirely, from the right.
*
* For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
* in that order.
*
* The flag is also cleared in two special cases:
*
- * b4. |__ _ _!|<_ _ _ (insert start right before existing end)
- * b5. |__ _ >|!__ _ _ (insert end right after existing start)
+ * b5. |__ _ _!|<_ _ _ (insert start right before existing end)
+ * b6. |__ _ >|!__ _ _ (insert end right after existing start)
*
* which always happen as last step and imply that no further
* overlapping is possible.
@@ -272,7 +278,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
if (nft_rbtree_interval_start(new)) {
if (nft_rbtree_interval_end(rbe) &&
nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_set_elem_expired(&rbe->ext))
+ !nft_set_elem_expired(&rbe->ext) && !*p)
overlap = false;
} else {
overlap = nft_rbtree_interval_end(rbe) &&
@@ -288,10 +294,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
nft_set_elem_active(&rbe->ext,
genmask) &&
!nft_set_elem_expired(&rbe->ext);
- } else if (nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext, genmask) &&
+ } else if (nft_set_elem_active(&rbe->ext, genmask) &&
!nft_set_elem_expired(&rbe->ext)) {
- overlap = true;
+ overlap = nft_rbtree_interval_end(rbe);
}
} else {
if (nft_rbtree_interval_end(rbe) &&
--
2.28.0
The patch titled
Subject: mm, THP, swap: fix allocating cluster for swapfile by mistake
has been added to the -mm tree. Its filename is
mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-swap-fix-allocating-cluste…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-swap-fix-allocating-cluste…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Gao Xiang <hsiangkao(a)redhat.com>
Subject: mm, THP, swap: fix allocating cluster for swapfile by mistake
SWP_FS doesn't mean the device is file-backed swap device, which just
means each writeback request should go through fs by DIO. Or it'll just
use extents added by .swap_activate(), but it also works as file-backed
swap device.
So in order to achieve the goal of the original patch, SWP_BLKDEV should
be used instead.
FS corruption can be observed with SSD device + XFS + fragmented swapfile
due to CONFIG_THP_SWAP=y.
I reproduced the issue with the following details:
Environment:
QEMU + upstream kernel + buildroot + NVMe (2 GB)
Kernel config:
CONFIG_BLK_DEV_NVME=y
CONFIG_THP_SWAP=y
Some reproducable steps:
mkfs.xfs -f /dev/nvme0n1
mkdir /tmp/mnt
mount /dev/nvme0n1 /tmp/mnt
bs="32k"
sz="1024m" # doesn't matter too much, I also tried 16m
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -F -S 0 -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fsync" /tmp/mnt/sw
mkswap /tmp/mnt/sw
swapon /tmp/mnt/sw
stress --vm 2 --vm-bytes 600M # doesn't matter too much as well
Symptoms:
- FS corruption (e.g. checksum failure)
- memory corruption at: 0xd2808010
- segfault
Link: https://lkml.kernel.org/r/20200819195613.24269-1-hsiangkao@redhat.com
Fixes: f0eea189e8e9 ("mm, THP, swap: Don't allocate huge cluster for file backed swap device")
Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out")
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swapfile.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/swapfile.c~mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake
+++ a/mm/swapfile.c
@@ -1078,7 +1078,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
_
Patches currently in -mm which might be from hsiangkao(a)redhat.com are
mm-thp-swap-fix-allocating-cluster-for-swapfile-by-mistake.patch
Adjust the calls to `user_regset_copyout' and `user_regset_copyin' in
`riscv_fpr_get' and `riscv_fpr_set' respectively so as to use @start_pos
and @end_pos according to API documentation in <linux/regset.h>, that is
to point at the beginning and the end respectively of the data chunk to
be copied. Update @data accordingly, also for the first call, to make
it clear which structure member is accessed.
We currently have @start_pos fixed at 0 across all calls, which works as
a result of the implementation, in particular because we have no padding
between the FP general registers and the FP control and status register,
but appears not to have been the intent of the API and is not what other
ports do, requiring one to study the copy handlers to understand what is
going on here.
Signed-off-by: Maciej W. Rozycki <macro(a)wdc.com>
Fixes: b8c8a9590e4f ("RISC-V: Add FP register ptrace support for gdb.")
Cc: stable(a)vger.kernel.org # 4.20+
---
arch/riscv/kernel/ptrace.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
linux-riscv-ptrace-fcsr.diff
Index: linux-hv/arch/riscv/kernel/ptrace.c
===================================================================
--- linux-hv.orig/arch/riscv/kernel/ptrace.c
+++ linux-hv/arch/riscv/kernel/ptrace.c
@@ -61,10 +61,13 @@ static int riscv_fpr_get(struct task_str
int ret;
struct __riscv_d_ext_state *fstate = &target->thread.fstate;
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, fstate, 0,
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fstate->f, 0,
offsetof(struct __riscv_d_ext_state, fcsr));
if (!ret) {
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, fstate, 0,
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &fstate->fcsr,
+ offsetof(struct __riscv_d_ext_state,
+ fcsr),
offsetof(struct __riscv_d_ext_state, fcsr) +
sizeof(fstate->fcsr));
}
@@ -80,10 +83,13 @@ static int riscv_fpr_set(struct task_str
int ret;
struct __riscv_d_ext_state *fstate = &target->thread.fstate;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, fstate, 0,
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fstate->f, 0,
offsetof(struct __riscv_d_ext_state, fcsr));
if (!ret) {
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, fstate, 0,
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &fstate->fcsr,
+ offsetof(struct __riscv_d_ext_state,
+ fcsr),
offsetof(struct __riscv_d_ext_state, fcsr) +
sizeof(fstate->fcsr));
}
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: ad8c735b1497 - Linux 5.8.2
The results of these automated tests are provided below.
Overall result: FAILED (see details below)
Merge: OK
Compile: OK
Tests: FAILED
All kernel binaries, config files, and logs are available for download here:
https://cki-artifacts.s3.us-east-2.amazonaws.com/index.html?prefix=dataware…
One or more kernel tests failed:
s390x:
❌ LTP
ppc64le:
❌ LTP
aarch64:
❌ LTP
We hope that these logs can help you find the problem quickly. For the full
detail on our testing procedures, please scroll to the bottom of this message.
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: make -j30 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: make -j30 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: make -j30 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: make -j30 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ storage: software RAID testing
✅ stress: stress-ng
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage blktests
Host 2:
✅ Boot test
✅ ACPI table test
✅ ACPI enabled test
✅ Podman system integration test - as root
✅ Podman system integration test - as user
❌ LTP
✅ Loopdev Sanity
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ jvm - jcstress tests
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking firewall: basic netfilter test
🚧 ✅ audit: audit testsuite test
🚧 ✅ trace: ftrace/tracer
🚧 ✅ kdump - kexec_boot
ppc64le:
Host 1:
✅ Boot test
✅ Podman system integration test - as root
✅ Podman system integration test - as user
❌ LTP
✅ Loopdev Sanity
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ jvm - jcstress tests
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking firewall: basic netfilter test
🚧 ✅ audit: audit testsuite test
🚧 ✅ trace: ftrace/tracer
Host 2:
✅ Boot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ storage: software RAID testing
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage blktests
Host 3:
✅ Boot test
🚧 ✅ kdump - sysrq-c
s390x:
Host 1:
✅ Boot test
✅ selinux-policy: serge-testsuite
✅ stress: stress-ng
🚧 ✅ Storage blktests
Host 2:
✅ Boot test
✅ Podman system integration test - as root
✅ Podman system integration test - as user
❌ LTP
✅ Loopdev Sanity
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ jvm - jcstress tests
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking firewall: basic netfilter test
🚧 ✅ audit: audit testsuite test
🚧 ✅ trace: ftrace/tracer
x86_64:
Host 1:
✅ Boot test
✅ ACPI table test
✅ Podman system integration test - as root
✅ Podman system integration test - as user
✅ LTP
✅ Loopdev Sanity
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: sanity smoke test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
🚧 ✅ CIFS Connectathon
🚧 ✅ POSIX pjd-fstest suites
🚧 ✅ jvm - jcstress tests
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking firewall: basic netfilter test
🚧 ✅ audit: audit testsuite test
🚧 ✅ trace: ftrace/tracer
🚧 ✅ kdump - kexec_boot
Host 2:
✅ Boot test
🚧 ✅ kdump - sysrq-c
🚧 ✅ kdump - file-load
Host 3:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ storage: software RAID testing
✅ stress: stress-ng
🚧 ❌ CPU: Frequency Driver Test
🚧 ✅ CPU: Idle Test
🚧 ✅ xfstests - btrfs
🚧 ⚡⚡⚡ IOMMU boot test
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ power-management: cpupower/sanity test
🚧 ⚡⚡⚡ Storage blktests
Test sources: https://gitlab.com/cki-project/kernel-tests
💚 Pull requests are welcome for new tests or improvements to existing tests!
Aborted tests
-------------
Tests that didn't complete running successfully are marked with ⚡⚡⚡.
If this was caused by an infrastructure issue, we try to mark that
explicitly in the report.
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Testing timeout
---------------
We aim to provide a report within reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.
The patch titled
Subject: ia64: fix build error with !COREDUMP
has been added to the -mm tree. Its filename is
ia64-fix-build-error-with-coredump.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/ia64-fix-build-error-with-coredum…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/ia64-fix-build-error-with-coredum…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Subject: ia64: fix build error with !COREDUMP
Fix linkage error when CONFIG_BINFMT_ELF is selected but CONFIG_COREDUMP
is not:
ia64-linux-ld: arch/ia64/kernel/elfcore.o: in function `elf_core_write_extra_phdrs':
elfcore.c:(.text+0x172): undefined reference to `dump_emit'
ia64-linux-ld: arch/ia64/kernel/elfcore.o: in function `elf_core_write_extra_data':
elfcore.c:(.text+0x2b2): undefined reference to `dump_emit'
Link: https://lkml.kernel.org/r/20200819064146.12529-1-krzk@kernel.org
Fixes: 1fcccbac89f5 ("elf coredump: replace ELF_CORE_EXTRA_* macros by functions")
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Reported-by: kernel test robot <lkp(a)intel.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: Fenghua Yu <fenghua.yu(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/ia64/kernel/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/ia64/kernel/Makefile~ia64-fix-build-error-with-coredump
+++ a/arch/ia64/kernel/Makefile
@@ -41,7 +41,7 @@ obj-y += esi_stub.o # must be in kern
endif
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
_
Patches currently in -mm which might be from krzk(a)kernel.org are
ia64-fix-build-error-with-coredump.patch
By default 'sdo_limit' is initialized with a default value of '8'
as per spec. This is overridden in cases where a different value is
required. However this is getting reset when snd_hdac_bus_init_chip()
is called again, which happens during runtime PM cycle.
Avoid this reset by moving 'sdo_limit' setup to 'snd_hdac_bus_init()'
function which would be called only once.
Fixes: 67ae482a59e9 ("ALSA: hda: add member to store ratio for stripe control")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Sameer Pujar <spujar(a)nvidia.com>
---
sound/hda/hdac_bus.c | 12 ++++++++++++
sound/hda/hdac_controller.c | 11 -----------
2 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/sound/hda/hdac_bus.c b/sound/hda/hdac_bus.c
index 09ddab5..9766f6a 100644
--- a/sound/hda/hdac_bus.c
+++ b/sound/hda/hdac_bus.c
@@ -46,6 +46,18 @@ int snd_hdac_bus_init(struct hdac_bus *bus, struct device *dev,
INIT_LIST_HEAD(&bus->hlink_list);
init_waitqueue_head(&bus->rirb_wq);
bus->irq = -1;
+
+ /*
+ * Default value of '8' is as per the HD audio specification (Rev 1.0a).
+ * Following relation is used to derive STRIPE control value.
+ * For sample rate <= 48K:
+ * { ((num_channels * bits_per_sample) / number of SDOs) >= 8 }
+ * For sample rate > 48K:
+ * { ((num_channels * bits_per_sample * rate/48000) /
+ * number of SDOs) >= 8 }
+ */
+ bus->sdo_limit = 8;
+
return 0;
}
EXPORT_SYMBOL_GPL(snd_hdac_bus_init);
diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c
index 011b17c..b98449f 100644
--- a/sound/hda/hdac_controller.c
+++ b/sound/hda/hdac_controller.c
@@ -529,17 +529,6 @@ bool snd_hdac_bus_init_chip(struct hdac_bus *bus, bool full_reset)
bus->chip_init = true;
- /*
- * Default value of '8' is as per the HD audio specification (Rev 1.0a).
- * Following relation is used to derive STRIPE control value.
- * For sample rate <= 48K:
- * { ((num_channels * bits_per_sample) / number of SDOs) >= 8 }
- * For sample rate > 48K:
- * { ((num_channels * bits_per_sample * rate/48000) /
- * number of SDOs) >= 8 }
- */
- bus->sdo_limit = 8;
-
return true;
}
EXPORT_SYMBOL_GPL(snd_hdac_bus_init_chip);
--
2.7.4
By default 'sdo_limit' is initialized with a default value of '8'
as per spec. This is overridden in cases where a different value is
required. However this is getting reset when snd_hdac_bus_init_chip()
is called again, which happens during runtime PM cycle. Avoid reset
by not initializing to default value everytime.
Fixes: 67ae482a59e9 ("ALSA: hda: add member to store ratio for stripe control")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Sameer Pujar <spujar(a)nvidia.com>
---
sound/hda/hdac_controller.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c
index 011b17c..0e26e96 100644
--- a/sound/hda/hdac_controller.c
+++ b/sound/hda/hdac_controller.c
@@ -538,7 +538,8 @@ bool snd_hdac_bus_init_chip(struct hdac_bus *bus, bool full_reset)
* { ((num_channels * bits_per_sample * rate/48000) /
* number of SDOs) >= 8 }
*/
- bus->sdo_limit = 8;
+ if (!bus->sdo_limit)
+ bus->sdo_limit = 8;
return true;
}
--
2.7.4
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 84e7a946da71f678affacea301f6d5cb4d9784e8 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul(a)crapouillou.net>
Date: Mon, 22 Jun 2020 23:45:48 +0200
Subject: [PATCH] pinctrl: ingenic: Properly detect GPIO direction when
configured for IRQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The PAT1 register contains information about the IRQ type (edge/level)
for input GPIOs with IRQ enabled, and the direction for non-IRQ GPIOs.
So it makes sense to read it only if the GPIO has no interrupt
configured, otherwise input GPIOs configured for level IRQs are
misdetected as output GPIOs.
Fixes: ebd6651418b6 ("pinctrl: ingenic: Implement .get_direction for GPIO chips")
Reported-by: João Henrique <johnnyonflame(a)hotmail.com>
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20200622214548.265417-2-paul@crapouillou.net
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c
index 241e563d5814..a8d1b53ec4c1 100644
--- a/drivers/pinctrl/pinctrl-ingenic.c
+++ b/drivers/pinctrl/pinctrl-ingenic.c
@@ -1958,7 +1958,8 @@ static int ingenic_gpio_get_direction(struct gpio_chip *gc, unsigned int offset)
unsigned int pin = gc->base + offset;
if (jzpc->info->version >= ID_JZ4760) {
- if (ingenic_get_pin_config(jzpc, pin, JZ4760_GPIO_PAT1))
+ if (ingenic_get_pin_config(jzpc, pin, JZ4760_GPIO_INT) ||
+ ingenic_get_pin_config(jzpc, pin, JZ4760_GPIO_PAT1))
return GPIO_LINE_DIRECTION_IN;
return GPIO_LINE_DIRECTION_OUT;
}
upstream commit id: ec95f1dedc9c64ac5a8b0bdb7c276936c70fdedd
I verified that ec95f1de "orangefs: get rid of knob code..."
will apply to 5.4 and I compiled and ran a patched 5.4 kernel
against my normal xfstests... I wish that ec95f1de could be
in the 5.4 long term stable kernel.
ec95f1de went upstream in 5.7. When I sent up the patch it was
just a theoretical race condition to me: I accepted what Christoph
said about it. We now have experienced in-the-real-world how
important the patch is...
Someone was trying to read a whole large (more than 100 meg)
file from orangefs into some kind of cloud bucket. The
resulting read failed with a "Bad address" error. I
immediately thought of this patch. I reproduced the
"Bad address" error with dd in kernel versions that
lack ec95f1de. The "Bad address" error does not occur
in kernels that include ec95f1de:
5.7.11-100.fc31.x86_64:
$ ./wr.sh 10000000 > /pvfsmnt/wr.10000000
$ dd if=/pvfsmnt/wr.10000000 of=/tmp/wr.10000000 count=10 bs=419430400
$ ls -l /pvfsmnt/wr.10000000 /tmp/wr.10000000
-rw-rw-r--. 1 hubcap hubcap 498888897 Aug 14 15:41 /pvfsmnt/wr.10000000
-rw-rw-r--. 1 hubcap hubcap 498888897 Aug 14 16:51 /tmp/wr.10000000
$ md5sum /pvfsmnt/wr.10000000 /tmp/wr.10000000
669daa04f91f561f5fb2851fb30e4ffe /pvfsmnt/wr.10000000
669daa04f91f561f5fb2851fb30e4ffe /tmp/wr.10000000
5.6.0hubcap:
$ ./wr.sh 10000000 > /pvfsmnt/wr.10000000
$ dd if=/pvfsmnt/wr.10000000 of=/tmp/wr.10000000 count=10 bs=419430400
dd: error reading '/pvfsmnt/wr.10000000': Bad address
0+0 records in
0+0 records out
0 bytes copied, 10.3365 s, 0.0 kB/s
Thanks!
-Mike
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bad20a2dbfdfaf01560026909506b6ed69d65ba2 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul(a)crapouillou.net>
Date: Thu, 16 Jul 2020 14:56:46 +0200
Subject: [PATCH] drm/panel-simple: Fix inverted V/H SYNC for Frida
FRD350H54004 panel
The FRD350H54004 panel was marked as having active-high VSYNC and HSYNC
signals, which sorts-of worked, but resulted in the picture fading out
under certain circumstances.
Fix this issue by marking VSYNC and HSYNC signals active-low.
v2: Rebase on drm-misc-next
Fixes: 7b6bd8433609 ("drm/panel: simple: Add support for the Frida FRD350H54004 panel")
Cc: stable(a)vger.kernel.org # v5.5
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
Signed-off-by: Sam Ravnborg <sam(a)ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20200716125647.10964-1-paul@c…
diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c
index f42249b72548..8b0bab9dd075 100644
--- a/drivers/gpu/drm/panel/panel-simple.c
+++ b/drivers/gpu/drm/panel/panel-simple.c
@@ -1763,7 +1763,7 @@ static const struct drm_display_mode frida_frd350h54004_mode = {
.vsync_start = 240 + 2,
.vsync_end = 240 + 2 + 6,
.vtotal = 240 + 2 + 6 + 2,
- .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC,
+ .flags = DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC,
};
static const struct panel_desc frida_frd350h54004 = {
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bad20a2dbfdfaf01560026909506b6ed69d65ba2 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul(a)crapouillou.net>
Date: Thu, 16 Jul 2020 14:56:46 +0200
Subject: [PATCH] drm/panel-simple: Fix inverted V/H SYNC for Frida
FRD350H54004 panel
The FRD350H54004 panel was marked as having active-high VSYNC and HSYNC
signals, which sorts-of worked, but resulted in the picture fading out
under certain circumstances.
Fix this issue by marking VSYNC and HSYNC signals active-low.
v2: Rebase on drm-misc-next
Fixes: 7b6bd8433609 ("drm/panel: simple: Add support for the Frida FRD350H54004 panel")
Cc: stable(a)vger.kernel.org # v5.5
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
Signed-off-by: Sam Ravnborg <sam(a)ravnborg.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20200716125647.10964-1-paul@c…
diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c
index f42249b72548..8b0bab9dd075 100644
--- a/drivers/gpu/drm/panel/panel-simple.c
+++ b/drivers/gpu/drm/panel/panel-simple.c
@@ -1763,7 +1763,7 @@ static const struct drm_display_mode frida_frd350h54004_mode = {
.vsync_start = 240 + 2,
.vsync_end = 240 + 2 + 6,
.vtotal = 240 + 2 + 6 + 2,
- .flags = DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC,
+ .flags = DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC,
};
static const struct panel_desc frida_frd350h54004 = {
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3b2a999582c467d1883716b37ffcc00178a13713 Mon Sep 17 00:00:00 2001
From: Liu Ying <victor.liu(a)nxp.com>
Date: Thu, 9 Jul 2020 10:28:52 +0800
Subject: [PATCH] drm/imx: imx-ldb: Disable both channels for split mode in
enc->disable()
Both of the two LVDS channels should be disabled for split mode
in the encoder's ->disable() callback, because they are enabled
in the encoder's ->enable() callback.
Fixes: 6556f7f82b9c ("drm: imx: Move imx-drm driver out of staging")
Cc: Philipp Zabel <p.zabel(a)pengutronix.de>
Cc: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: Pengutronix Kernel Team <kernel(a)pengutronix.de>
Cc: NXP Linux Team <linux-imx(a)nxp.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Liu Ying <victor.liu(a)nxp.com>
Signed-off-by: Philipp Zabel <p.zabel(a)pengutronix.de>
diff --git a/drivers/gpu/drm/imx/imx-ldb.c b/drivers/gpu/drm/imx/imx-ldb.c
index 909682a74474..8791d60be92e 100644
--- a/drivers/gpu/drm/imx/imx-ldb.c
+++ b/drivers/gpu/drm/imx/imx-ldb.c
@@ -296,18 +296,19 @@ static void imx_ldb_encoder_disable(struct drm_encoder *encoder)
{
struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
struct imx_ldb *ldb = imx_ldb_ch->ldb;
+ int dual = ldb->ldb_ctrl & LDB_SPLIT_MODE_EN;
int mux, ret;
drm_panel_disable(imx_ldb_ch->panel);
- if (imx_ldb_ch == &ldb->channel[0])
+ if (imx_ldb_ch == &ldb->channel[0] || dual)
ldb->ldb_ctrl &= ~LDB_CH0_MODE_EN_MASK;
- else if (imx_ldb_ch == &ldb->channel[1])
+ if (imx_ldb_ch == &ldb->channel[1] || dual)
ldb->ldb_ctrl &= ~LDB_CH1_MODE_EN_MASK;
regmap_write(ldb->regmap, IOMUXC_GPR2, ldb->ldb_ctrl);
- if (ldb->ldb_ctrl & LDB_SPLIT_MODE_EN) {
+ if (dual) {
clk_disable_unprepare(ldb->clk[0]);
clk_disable_unprepare(ldb->clk[1]);
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3b2a999582c467d1883716b37ffcc00178a13713 Mon Sep 17 00:00:00 2001
From: Liu Ying <victor.liu(a)nxp.com>
Date: Thu, 9 Jul 2020 10:28:52 +0800
Subject: [PATCH] drm/imx: imx-ldb: Disable both channels for split mode in
enc->disable()
Both of the two LVDS channels should be disabled for split mode
in the encoder's ->disable() callback, because they are enabled
in the encoder's ->enable() callback.
Fixes: 6556f7f82b9c ("drm: imx: Move imx-drm driver out of staging")
Cc: Philipp Zabel <p.zabel(a)pengutronix.de>
Cc: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: Pengutronix Kernel Team <kernel(a)pengutronix.de>
Cc: NXP Linux Team <linux-imx(a)nxp.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Liu Ying <victor.liu(a)nxp.com>
Signed-off-by: Philipp Zabel <p.zabel(a)pengutronix.de>
diff --git a/drivers/gpu/drm/imx/imx-ldb.c b/drivers/gpu/drm/imx/imx-ldb.c
index 909682a74474..8791d60be92e 100644
--- a/drivers/gpu/drm/imx/imx-ldb.c
+++ b/drivers/gpu/drm/imx/imx-ldb.c
@@ -296,18 +296,19 @@ static void imx_ldb_encoder_disable(struct drm_encoder *encoder)
{
struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
struct imx_ldb *ldb = imx_ldb_ch->ldb;
+ int dual = ldb->ldb_ctrl & LDB_SPLIT_MODE_EN;
int mux, ret;
drm_panel_disable(imx_ldb_ch->panel);
- if (imx_ldb_ch == &ldb->channel[0])
+ if (imx_ldb_ch == &ldb->channel[0] || dual)
ldb->ldb_ctrl &= ~LDB_CH0_MODE_EN_MASK;
- else if (imx_ldb_ch == &ldb->channel[1])
+ if (imx_ldb_ch == &ldb->channel[1] || dual)
ldb->ldb_ctrl &= ~LDB_CH1_MODE_EN_MASK;
regmap_write(ldb->regmap, IOMUXC_GPR2, ldb->ldb_ctrl);
- if (ldb->ldb_ctrl & LDB_SPLIT_MODE_EN) {
+ if (dual) {
clk_disable_unprepare(ldb->clk[0]);
clk_disable_unprepare(ldb->clk[1]);
}
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 244d012801dae30c91983b360457c78d481584b0 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Thu, 16 Jul 2020 14:53:52 +0200
Subject: [PATCH] drm/ast: Initialize DRAM type before posting GPU
Posting the GPU requires the correct DRAM type to be stored in
struct ast_private. Therefore first initialize the DRAM info and
then post the GPU. This restores the original order of instructions
in this function.
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Reviewed-by: Sam Ravnborg <sam(a)ravnborg.org>
Acked-by: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
Fixes: bad09da6deab ("drm/ast: Fixed vram size incorrect issue on POWER")
Cc: Joel Stanley <joel(a)jms.id.au>
Cc: Y.C. Chen <yc_chen(a)aspeedtech.com>
Cc: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: Gerd Hoffmann <kraxel(a)redhat.com>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: Sam Ravnborg <sam(a)ravnborg.org>
Cc: Emil Velikov <emil.l.velikov(a)gmail.com>
Cc: "Y.C. Chen" <yc_chen(a)aspeedtech.com>
Cc: <stable(a)vger.kernel.org> # v4.11+
Link: https://patchwork.freedesktop.org/patch/msgid/20200716125353.31512-6-tzimme…
diff --git a/drivers/gpu/drm/ast/ast_main.c b/drivers/gpu/drm/ast/ast_main.c
index b162cc82204d..87e5baded2a7 100644
--- a/drivers/gpu/drm/ast/ast_main.c
+++ b/drivers/gpu/drm/ast/ast_main.c
@@ -418,15 +418,15 @@ int ast_driver_load(struct drm_device *dev, unsigned long flags)
ast_detect_chip(dev, &need_post);
- if (need_post)
- ast_post_gpu(dev);
-
ret = ast_get_dram_info(dev);
if (ret)
goto out_free;
drm_info(dev, "dram MCLK=%u Mhz type=%d bus_width=%d\n",
ast->mclk, ast->dram_type, ast->dram_bus_width);
+ if (need_post)
+ ast_post_gpu(dev);
+
ret = ast_mm_init(ast);
if (ret)
goto out_free;
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 20 Jul 2020 15:07:30 -0700
Subject: [PATCH] libnvdimm: Validate command family indices
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However there is no validation
of the family index relative to what the bus supports.
- Update the NFIT bus implementation (the only one that supports
ND_CMD_CALL passthrough) to also passlist the valid set of command
family indices.
- Update the generic __nd_ioctl() path to validate that field on behalf
of all implementations.
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism")
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem)
static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+ set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* Note, that checking for function0 (bit0) tells us if any commands
* are reachable through this GUID.
*/
+ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
- if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+ if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+ set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+ }
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+ set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+ set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
- set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
dimm_name = "bus";
}
+ /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+ unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+ if (nvdimm) {
+ if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->dimm_family_mask;
+ } else {
+ if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->bus_family_mask;
+ }
+
+ if (!test_bit(pkg.nd_family, mask))
+ return -EINVAL;
}
if (!desc ||
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 18da4059be09..bd39a2cf7972 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
unsigned long bus_dsm_mask;
unsigned long cmd_mask;
+ unsigned long dimm_family_mask;
+ unsigned long bus_family_mask;
struct module *module;
char *provider_name;
struct device_node *of_node;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 0e09dc5cec19..e9468b9332bd 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -245,6 +245,10 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_MSFT 3
#define NVDIMM_FAMILY_HYPERV 4
#define NVDIMM_FAMILY_PAPR 5
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR
+
+#define NVDIMM_BUS_FAMILY_NFIT 0
+#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 20 Jul 2020 15:07:30 -0700
Subject: [PATCH] libnvdimm: Validate command family indices
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However there is no validation
of the family index relative to what the bus supports.
- Update the NFIT bus implementation (the only one that supports
ND_CMD_CALL passthrough) to also passlist the valid set of command
family indices.
- Update the generic __nd_ioctl() path to validate that field on behalf
of all implementations.
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism")
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem)
static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+ set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* Note, that checking for function0 (bit0) tells us if any commands
* are reachable through this GUID.
*/
+ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
- if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+ if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+ set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+ }
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+ set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+ set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
- set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
dimm_name = "bus";
}
+ /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+ unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+ if (nvdimm) {
+ if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->dimm_family_mask;
+ } else {
+ if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->bus_family_mask;
+ }
+
+ if (!test_bit(pkg.nd_family, mask))
+ return -EINVAL;
}
if (!desc ||
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 18da4059be09..bd39a2cf7972 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
unsigned long bus_dsm_mask;
unsigned long cmd_mask;
+ unsigned long dimm_family_mask;
+ unsigned long bus_family_mask;
struct module *module;
char *provider_name;
struct device_node *of_node;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 0e09dc5cec19..e9468b9332bd 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -245,6 +245,10 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_MSFT 3
#define NVDIMM_FAMILY_HYPERV 4
#define NVDIMM_FAMILY_PAPR 5
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR
+
+#define NVDIMM_BUS_FAMILY_NFIT 0
+#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 20 Jul 2020 15:07:30 -0700
Subject: [PATCH] libnvdimm: Validate command family indices
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However there is no validation
of the family index relative to what the bus supports.
- Update the NFIT bus implementation (the only one that supports
ND_CMD_CALL passthrough) to also passlist the valid set of command
family indices.
- Update the generic __nd_ioctl() path to validate that field on behalf
of all implementations.
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism")
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem)
static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+ set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* Note, that checking for function0 (bit0) tells us if any commands
* are reachable through this GUID.
*/
+ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
- if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+ if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+ set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+ }
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+ set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+ set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
- set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
dimm_name = "bus";
}
+ /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+ unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+ if (nvdimm) {
+ if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->dimm_family_mask;
+ } else {
+ if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->bus_family_mask;
+ }
+
+ if (!test_bit(pkg.nd_family, mask))
+ return -EINVAL;
}
if (!desc ||
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 18da4059be09..bd39a2cf7972 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
unsigned long bus_dsm_mask;
unsigned long cmd_mask;
+ unsigned long dimm_family_mask;
+ unsigned long bus_family_mask;
struct module *module;
char *provider_name;
struct device_node *of_node;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 0e09dc5cec19..e9468b9332bd 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -245,6 +245,10 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_MSFT 3
#define NVDIMM_FAMILY_HYPERV 4
#define NVDIMM_FAMILY_PAPR 5
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR
+
+#define NVDIMM_BUS_FAMILY_NFIT 0
+#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 20 Jul 2020 15:07:30 -0700
Subject: [PATCH] libnvdimm: Validate command family indices
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However there is no validation
of the family index relative to what the bus supports.
- Update the NFIT bus implementation (the only one that supports
ND_CMD_CALL passthrough) to also passlist the valid set of command
family indices.
- Update the generic __nd_ioctl() path to validate that field on behalf
of all implementations.
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism")
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem)
static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+ set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* Note, that checking for function0 (bit0) tells us if any commands
* are reachable through this GUID.
*/
+ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
- if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+ if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+ set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+ }
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+ set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+ set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
- set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
dimm_name = "bus";
}
+ /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+ unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+ if (nvdimm) {
+ if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->dimm_family_mask;
+ } else {
+ if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->bus_family_mask;
+ }
+
+ if (!test_bit(pkg.nd_family, mask))
+ return -EINVAL;
}
if (!desc ||
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 18da4059be09..bd39a2cf7972 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
unsigned long bus_dsm_mask;
unsigned long cmd_mask;
+ unsigned long dimm_family_mask;
+ unsigned long bus_family_mask;
struct module *module;
char *provider_name;
struct device_node *of_node;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 0e09dc5cec19..e9468b9332bd 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -245,6 +245,10 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_MSFT 3
#define NVDIMM_FAMILY_HYPERV 4
#define NVDIMM_FAMILY_PAPR 5
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR
+
+#define NVDIMM_BUS_FAMILY_NFIT 0
+#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 20 Jul 2020 15:07:30 -0700
Subject: [PATCH] libnvdimm: Validate command family indices
The ND_CMD_CALL format allows for a general passthrough of passlisted
commands targeting a given command set. However there is no validation
of the family index relative to what the bus supports.
- Update the NFIT bus implementation (the only one that supports
ND_CMD_CALL passthrough) to also passlist the valid set of command
family indices.
- Update the generic __nd_ioctl() path to validate that field on behalf
of all implementations.
Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism")
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 7c138a4edc03..1f72ce1a782b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem)
static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem, u32 device_handle)
{
+ struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev;
unsigned long dsm_mask, label_mask;
@@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
/* nfit test assumes 1:1 relationship between commands and dsms */
nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
nfit_mem->family = NVDIMM_FAMILY_INTEL;
+ set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x",
@@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* Note, that checking for function0 (bit0) tells us if any commands
* are reachable through this GUID.
*/
+ clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask);
for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
- if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+ if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) {
+ set_bit(i, &nd_desc->dimm_family_mask);
if (family < 0 || i == default_dsm_family)
family = i;
+ }
/* limit the supported commands to those that are publicly documented */
nfit_mem->family = family;
@@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
+ set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
+ set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask);
+
adev = to_acpi_dev(acpi_desc);
if (!adev)
return;
@@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i))
set_bit(i, &nd_desc->cmd_mask);
- set_bit(ND_CMD_CALL, &nd_desc->cmd_mask);
dsm_mask =
(1 << ND_CMD_ARS_CAP) |
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index f5525f8bb770..5c5e7ebba8dc 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -33,7 +33,6 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
-#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
#define NVDIMM_CMD_MAX 31
#define NVDIMM_STANDARD_CMDMASK \
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabd..955265656b96 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
dimm_name = "bus";
}
+ /* Validate command family support against bus declared support */
if (cmd == ND_CMD_CALL) {
+ unsigned long *mask;
+
if (copy_from_user(&pkg, p, sizeof(pkg)))
return -EFAULT;
+
+ if (nvdimm) {
+ if (pkg.nd_family > NVDIMM_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->dimm_family_mask;
+ } else {
+ if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX)
+ return -EINVAL;
+ mask = &nd_desc->bus_family_mask;
+ }
+
+ if (!test_bit(pkg.nd_family, mask))
+ return -EINVAL;
}
if (!desc ||
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 18da4059be09..bd39a2cf7972 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor {
const struct attribute_group **attr_groups;
unsigned long bus_dsm_mask;
unsigned long cmd_mask;
+ unsigned long dimm_family_mask;
+ unsigned long bus_family_mask;
struct module *module;
char *provider_name;
struct device_node *of_node;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 0e09dc5cec19..e9468b9332bd 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -245,6 +245,10 @@ struct nd_cmd_pkg {
#define NVDIMM_FAMILY_MSFT 3
#define NVDIMM_FAMILY_HYPERV 4
#define NVDIMM_FAMILY_PAPR 5
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR
+
+#define NVDIMM_BUS_FAMILY_NFIT 0
+#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT
#define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\
struct nd_cmd_pkg)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 135b9e8d1cd8ba5ac9ad9bcf24b464b7b052e5b8 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <sibis(a)codeaurora.org>
Date: Thu, 23 Jul 2020 01:40:46 +0530
Subject: [PATCH] remoteproc: qcom_q6v5_mss: Validate modem blob firmware size
before load
The following mem abort is observed when one of the modem blob firmware
size exceeds the allocated mpss region. Fix this by restricting the copy
size to segment size using request_firmware_into_buf before load.
Err Logs:
Unable to handle kernel paging request at virtual address
Mem abort info:
...
Call trace:
__memcpy+0x110/0x180
rproc_start+0xd0/0x190
rproc_boot+0x404/0x550
state_store+0x54/0xf8
dev_attr_store+0x44/0x60
sysfs_kf_write+0x58/0x80
kernfs_fop_write+0x140/0x230
vfs_write+0xc4/0x208
ksys_write+0x74/0xf8
...
Reviewed-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Fixes: 051fb70fd4ea4 ("remoteproc: qcom: Driver for the self-authenticating Hexagon v5")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sibi Sankar <sibis(a)codeaurora.org>
Link: https://lore.kernel.org/r/20200722201047.12975-3-sibis@codeaurora.org
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 7826f229957d..8199d9f59209 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -1173,15 +1173,14 @@ static int q6v5_mpss_load(struct q6v5 *qproc)
} else if (phdr->p_filesz) {
/* Replace "xxx.xxx" with "xxx.bxx" */
sprintf(fw_name + fw_name_len - 3, "b%02d", i);
- ret = request_firmware(&seg_fw, fw_name, qproc->dev);
+ ret = request_firmware_into_buf(&seg_fw, fw_name, qproc->dev,
+ ptr, phdr->p_filesz);
if (ret) {
dev_err(qproc->dev, "failed to load %s\n", fw_name);
iounmap(ptr);
goto release_firmware;
}
- memcpy(ptr, seg_fw->data, seg_fw->size);
-
release_firmware(seg_fw);
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 135b9e8d1cd8ba5ac9ad9bcf24b464b7b052e5b8 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <sibis(a)codeaurora.org>
Date: Thu, 23 Jul 2020 01:40:46 +0530
Subject: [PATCH] remoteproc: qcom_q6v5_mss: Validate modem blob firmware size
before load
The following mem abort is observed when one of the modem blob firmware
size exceeds the allocated mpss region. Fix this by restricting the copy
size to segment size using request_firmware_into_buf before load.
Err Logs:
Unable to handle kernel paging request at virtual address
Mem abort info:
...
Call trace:
__memcpy+0x110/0x180
rproc_start+0xd0/0x190
rproc_boot+0x404/0x550
state_store+0x54/0xf8
dev_attr_store+0x44/0x60
sysfs_kf_write+0x58/0x80
kernfs_fop_write+0x140/0x230
vfs_write+0xc4/0x208
ksys_write+0x74/0xf8
...
Reviewed-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Fixes: 051fb70fd4ea4 ("remoteproc: qcom: Driver for the self-authenticating Hexagon v5")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sibi Sankar <sibis(a)codeaurora.org>
Link: https://lore.kernel.org/r/20200722201047.12975-3-sibis@codeaurora.org
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 7826f229957d..8199d9f59209 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -1173,15 +1173,14 @@ static int q6v5_mpss_load(struct q6v5 *qproc)
} else if (phdr->p_filesz) {
/* Replace "xxx.xxx" with "xxx.bxx" */
sprintf(fw_name + fw_name_len - 3, "b%02d", i);
- ret = request_firmware(&seg_fw, fw_name, qproc->dev);
+ ret = request_firmware_into_buf(&seg_fw, fw_name, qproc->dev,
+ ptr, phdr->p_filesz);
if (ret) {
dev_err(qproc->dev, "failed to load %s\n", fw_name);
iounmap(ptr);
goto release_firmware;
}
- memcpy(ptr, seg_fw->data, seg_fw->size);
-
release_firmware(seg_fw);
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e013f455d95add874f310dc47c608e8c70692ae5 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <sibis(a)codeaurora.org>
Date: Thu, 23 Jul 2020 01:40:45 +0530
Subject: [PATCH] remoteproc: qcom_q6v5_mss: Validate MBA firmware size before
load
The following mem abort is observed when the mba firmware size exceeds
the allocated mba region. MBA firmware size is restricted to a maximum
size of 1M and remaining memory region is used by modem debug policy
firmware when available. Hence verify whether the MBA firmware size lies
within the allocated memory region and is not greater than 1M before
loading.
Err Logs:
Unable to handle kernel paging request at virtual address
Mem abort info:
...
Call trace:
__memcpy+0x110/0x180
rproc_start+0x40/0x218
rproc_boot+0x5b4/0x608
state_store+0x54/0xf8
dev_attr_store+0x44/0x60
sysfs_kf_write+0x58/0x80
kernfs_fop_write+0x140/0x230
vfs_write+0xc4/0x208
ksys_write+0x74/0xf8
__arm64_sys_write+0x24/0x30
...
Reviewed-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Fixes: 051fb70fd4ea4 ("remoteproc: qcom: Driver for the self-authenticating Hexagon v5")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sibi Sankar <sibis(a)codeaurora.org>
Link: https://lore.kernel.org/r/20200722201047.12975-2-sibis@codeaurora.org
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 03d7f3d702b3..7826f229957d 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -411,6 +411,12 @@ static int q6v5_load(struct rproc *rproc, const struct firmware *fw)
{
struct q6v5 *qproc = rproc->priv;
+ /* MBA is restricted to a maximum size of 1M */
+ if (fw->size > qproc->mba_size || fw->size > SZ_1M) {
+ dev_err(qproc->dev, "MBA firmware load failed\n");
+ return -EINVAL;
+ }
+
memcpy(qproc->mba_region, fw->data, fw->size);
return 0;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e013f455d95add874f310dc47c608e8c70692ae5 Mon Sep 17 00:00:00 2001
From: Sibi Sankar <sibis(a)codeaurora.org>
Date: Thu, 23 Jul 2020 01:40:45 +0530
Subject: [PATCH] remoteproc: qcom_q6v5_mss: Validate MBA firmware size before
load
The following mem abort is observed when the mba firmware size exceeds
the allocated mba region. MBA firmware size is restricted to a maximum
size of 1M and remaining memory region is used by modem debug policy
firmware when available. Hence verify whether the MBA firmware size lies
within the allocated memory region and is not greater than 1M before
loading.
Err Logs:
Unable to handle kernel paging request at virtual address
Mem abort info:
...
Call trace:
__memcpy+0x110/0x180
rproc_start+0x40/0x218
rproc_boot+0x5b4/0x608
state_store+0x54/0xf8
dev_attr_store+0x44/0x60
sysfs_kf_write+0x58/0x80
kernfs_fop_write+0x140/0x230
vfs_write+0xc4/0x208
ksys_write+0x74/0xf8
__arm64_sys_write+0x24/0x30
...
Reviewed-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Fixes: 051fb70fd4ea4 ("remoteproc: qcom: Driver for the self-authenticating Hexagon v5")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sibi Sankar <sibis(a)codeaurora.org>
Link: https://lore.kernel.org/r/20200722201047.12975-2-sibis@codeaurora.org
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 03d7f3d702b3..7826f229957d 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -411,6 +411,12 @@ static int q6v5_load(struct rproc *rproc, const struct firmware *fw)
{
struct q6v5 *qproc = rproc->priv;
+ /* MBA is restricted to a maximum size of 1M */
+ if (fw->size > qproc->mba_size || fw->size > SZ_1M) {
+ dev_err(qproc->dev, "MBA firmware load failed\n");
+ return -EINVAL;
+ }
+
memcpy(qproc->mba_region, fw->data, fw->size);
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 12d572e785b15bc764e956caaa8a4c846fd15694 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Date: Fri, 10 Jul 2020 22:11:23 +0900
Subject: [PATCH] perf probe: Fix memory leakage when the probe point is not
found
Fix the memory leakage in debuginfo__find_trace_events() when the probe
point is not found in the debuginfo. If there is no probe point found in
the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
release the allocated memory for the array of struct probe_trace_event.
The current code releases the memory only if the debuginfo__find_probes()
hits an error but not checks tf.ntevs. In the result, the memory allocated
on *tevs are not released if tf.ntevs == 0.
This fixes the memory leakage by checking tf.ntevs == 0 in addition to
ret < 0.
Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@d…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 9963e4e8ea20..659024342e9a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1467,7 +1467,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
if (ret >= 0 && tf.pf.skip_empty_arg)
ret = fill_empty_trace_arg(pev, tf.tevs, tf.ntevs);
- if (ret < 0) {
+ if (ret < 0 || tf.ntevs == 0) {
for (i = 0; i < tf.ntevs; i++)
clear_probe_trace_event(&tf.tevs[i]);
zfree(tevs);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 12d572e785b15bc764e956caaa8a4c846fd15694 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Date: Fri, 10 Jul 2020 22:11:23 +0900
Subject: [PATCH] perf probe: Fix memory leakage when the probe point is not
found
Fix the memory leakage in debuginfo__find_trace_events() when the probe
point is not found in the debuginfo. If there is no probe point found in
the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
release the allocated memory for the array of struct probe_trace_event.
The current code releases the memory only if the debuginfo__find_probes()
hits an error but not checks tf.ntevs. In the result, the memory allocated
on *tevs are not released if tf.ntevs == 0.
This fixes the memory leakage by checking tf.ntevs == 0 in addition to
ret < 0.
Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@d…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 9963e4e8ea20..659024342e9a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1467,7 +1467,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
if (ret >= 0 && tf.pf.skip_empty_arg)
ret = fill_empty_trace_arg(pev, tf.tevs, tf.ntevs);
- if (ret < 0) {
+ if (ret < 0 || tf.ntevs == 0) {
for (i = 0; i < tf.ntevs; i++)
clear_probe_trace_event(&tf.tevs[i]);
zfree(tevs);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 12d572e785b15bc764e956caaa8a4c846fd15694 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Date: Fri, 10 Jul 2020 22:11:23 +0900
Subject: [PATCH] perf probe: Fix memory leakage when the probe point is not
found
Fix the memory leakage in debuginfo__find_trace_events() when the probe
point is not found in the debuginfo. If there is no probe point found in
the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
release the allocated memory for the array of struct probe_trace_event.
The current code releases the memory only if the debuginfo__find_probes()
hits an error but not checks tf.ntevs. In the result, the memory allocated
on *tevs are not released if tf.ntevs == 0.
This fixes the memory leakage by checking tf.ntevs == 0 in addition to
ret < 0.
Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@d…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 9963e4e8ea20..659024342e9a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1467,7 +1467,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
if (ret >= 0 && tf.pf.skip_empty_arg)
ret = fill_empty_trace_arg(pev, tf.tevs, tf.ntevs);
- if (ret < 0) {
+ if (ret < 0 || tf.ntevs == 0) {
for (i = 0; i < tf.ntevs; i++)
clear_probe_trace_event(&tf.tevs[i]);
zfree(tevs);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 12d572e785b15bc764e956caaa8a4c846fd15694 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Date: Fri, 10 Jul 2020 22:11:23 +0900
Subject: [PATCH] perf probe: Fix memory leakage when the probe point is not
found
Fix the memory leakage in debuginfo__find_trace_events() when the probe
point is not found in the debuginfo. If there is no probe point found in
the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
release the allocated memory for the array of struct probe_trace_event.
The current code releases the memory only if the debuginfo__find_probes()
hits an error but not checks tf.ntevs. In the result, the memory allocated
on *tevs are not released if tf.ntevs == 0.
This fixes the memory leakage by checking tf.ntevs == 0 in addition to
ret < 0.
Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@d…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 9963e4e8ea20..659024342e9a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1467,7 +1467,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
if (ret >= 0 && tf.pf.skip_empty_arg)
ret = fill_empty_trace_arg(pev, tf.tevs, tf.ntevs);
- if (ret < 0) {
+ if (ret < 0 || tf.ntevs == 0) {
for (i = 0; i < tf.ntevs; i++)
clear_probe_trace_event(&tf.tevs[i]);
zfree(tevs);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 12d572e785b15bc764e956caaa8a4c846fd15694 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Date: Fri, 10 Jul 2020 22:11:23 +0900
Subject: [PATCH] perf probe: Fix memory leakage when the probe point is not
found
Fix the memory leakage in debuginfo__find_trace_events() when the probe
point is not found in the debuginfo. If there is no probe point found in
the debuginfo, debuginfo__find_probes() will NOT return -ENOENT, but 0.
Thus the caller of debuginfo__find_probes() must check the tf.ntevs and
release the allocated memory for the array of struct probe_trace_event.
The current code releases the memory only if the debuginfo__find_probes()
hits an error but not checks tf.ntevs. In the result, the memory allocated
on *tevs are not released if tf.ntevs == 0.
This fixes the memory leakage by checking tf.ntevs == 0 in addition to
ret < 0.
Fixes: ff741783506c ("perf probe: Introduce debuginfo to encapsulate dwarf information")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Reviewed-by: Srikar Dronamraju <srikar(a)linux.vnet.ibm.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: http://lore.kernel.org/lkml/159438668346.62703.10887420400718492503.stgit@d…
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 9963e4e8ea20..659024342e9a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1467,7 +1467,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
if (ret >= 0 && tf.pf.skip_empty_arg)
ret = fill_empty_trace_arg(pev, tf.tevs, tf.ntevs);
- if (ret < 0) {
+ if (ret < 0 || tf.ntevs == 0) {
for (i = 0; i < tf.ntevs; i++)
clear_probe_trace_event(&tf.tevs[i]);
zfree(tevs);
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0768e6e4934e239f1a7f8ba83150a7c46765bb3e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Wed, 17 Jun 2020 12:23:05 +0200
Subject: [PATCH] dt-bindings: power: supply: bq25890: Document required
interrupt
The driver requires interrupts (fails probe if it is not provided) so
document this requirement in bindings.
Fixes: 4aeae9cb0dad ("power_supply: Add support for TI BQ25890 charger chip")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel(a)collabora.com>
diff --git a/Documentation/devicetree/bindings/power/supply/bq25890.txt b/Documentation/devicetree/bindings/power/supply/bq25890.txt
index 51ecc756521f..3b4c69a7fa70 100644
--- a/Documentation/devicetree/bindings/power/supply/bq25890.txt
+++ b/Documentation/devicetree/bindings/power/supply/bq25890.txt
@@ -10,6 +10,7 @@ Required properties:
* "ti,bq25895"
* "ti,bq25896"
- reg: integer, i2c address of the device.
+- interrupts: interrupt line;
- ti,battery-regulation-voltage: integer, maximum charging voltage (in uV);
- ti,charge-current: integer, maximum charging current (in uA);
- ti,termination-current: integer, charge will be terminated when current in
@@ -39,6 +40,9 @@ bq25890 {
compatible = "ti,bq25890";
reg = <0x6a>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
+
ti,battery-regulation-voltage = <4200000>;
ti,charge-current = <1000000>;
ti,termination-current = <50000>;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d9539752d23283db4692384a634034f451261e29 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook(a)chromium.org>
Date: Tue, 9 Jun 2020 16:11:29 -0700
Subject: [PATCH] net/compat: Add missing sock updates for SCM_RIGHTS
Add missed sock updates to compat path via a new helper, which will be
used more in coming patches. (The net/core/scm.c code is left as-is here
to assist with -stable backports for the compat path.)
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Sargun Dhillon <sargun(a)sargun.me>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: stable(a)vger.kernel.org
Fixes: 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
Fixes: d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")
Acked-by: Christian Brauner <christian.brauner(a)ubuntu.com>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
diff --git a/include/net/sock.h b/include/net/sock.h
index c53cc42b5ab9..2be67f1ee8b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -890,6 +890,8 @@ static inline int sk_memalloc_socks(void)
{
return static_branch_unlikely(&memalloc_socks_key);
}
+
+void __receive_sock(struct file *file);
#else
static inline int sk_memalloc_socks(void)
@@ -897,6 +899,8 @@ static inline int sk_memalloc_socks(void)
return 0;
}
+static inline void __receive_sock(struct file *file)
+{ }
#endif
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
diff --git a/net/compat.c b/net/compat.c
index 5e3041a2c37d..2937b816107d 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -309,6 +309,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
break;
}
/* Bump the usage count and install the file. */
+ __receive_sock(fp[i]);
fd_install(new_fd, get_file(fp[i]));
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c4acf1f0220..bde394979041 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2840,6 +2840,27 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+ struct socket *sock;
+ int error;
+
+ /*
+ * The resulting value of "error" is ignored here since we only
+ * need to take action when the file is a socket and testing
+ * "sock" for NULL is sufficient.
+ */
+ sock = sock_from_file(file, &error);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+}
+
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
ssize_t res;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 802141462d844f2e6a4d63a12260d79b7afc4c34 Mon Sep 17 00:00:00 2001
From: Ahmad Fatoum <a.fatoum(a)pengutronix.de>
Date: Thu, 11 Jun 2020 21:17:44 +0200
Subject: [PATCH] watchdog: f71808e_wdt: remove use of wrong watchdog_info
option
The flags that should be or-ed into the watchdog_info.options by drivers
all start with WDIOF_, e.g. WDIOF_SETTIMEOUT, which indicates that the
driver's watchdog_ops has a usable set_timeout.
WDIOC_SETTIMEOUT was used instead, which expands to 0xc0045706, which
equals:
WDIOF_FANFAULT | WDIOF_EXTERN1 | WDIOF_PRETIMEOUT | WDIOF_ALARMONLY |
WDIOF_MAGICCLOSE | 0xc0045000
These were so far indicated to userspace on WDIOC_GETSUPPORT.
As the driver has not yet been migrated to the new watchdog kernel API,
the constant can just be dropped without substitute.
Fixes: 96cb4eb019ce ("watchdog: f71808e_wdt: new watchdog driver for Fintek F71808E and F71882FG")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ahmad Fatoum <a.fatoum(a)pengutronix.de>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20200611191750.28096-4-a.fatoum@pengutronix.de
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim(a)linux-watchdog.org>
diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c
index c8ce80c13403..8e5584c54423 100644
--- a/drivers/watchdog/f71808e_wdt.c
+++ b/drivers/watchdog/f71808e_wdt.c
@@ -690,8 +690,7 @@ static int __init watchdog_init(int sioaddr)
* into the module have been registered yet.
*/
watchdog.sioaddr = sioaddr;
- watchdog.ident.options = WDIOC_SETTIMEOUT
- | WDIOF_MAGICCLOSE
+ watchdog.ident.options = WDIOF_MAGICCLOSE
| WDIOF_KEEPALIVEPING
| WDIOF_CARDRESET;
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
commit 7ef282e05132d56b6f6b71e3873f317664bea78b upstream
If a process has the trace_pipe open on a trace_array, the current tracer
for that trace array should not be changed. This was original enforced by a
global lock, but when instances were introduced, it was moved to the
current_trace. But this structure is shared by all instances, and a
trace_pipe is for a single instance. There's no reason that a process that
has trace_pipe open on one instance should prevent another instance from
changing its current tracer. Move the reference counter to the trace_array
instead.
This is marked as "Fixes" but is more of a clean up than a true fix.
Backport if you want, but its not critical.
Fixes: cf6ab6d9143b1 ("tracing: Add ref count to tracer for when they are being read by pipe")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Cc: stable(a)vger.kernel.org # 5.4.x
[Resolved conflict in __remove_instance()]
Signed-off-by: dann frazier <dann.frazier(a)canonical.com>
---
This addresses an issue we've seen with users trying to change
current_tracer when they happen to have rasdaemon installed.
rasdaemon uses the trace_pipe interface at runtime, which therefore
blocks changing the current tracer. But of course, unless
you know about rasdaemon internals, it isn't exactly an obvious
failure mode.
Conflict in __remove_instance() was due to a change in reference counter
semantics introduced later in v5.5 via commit 288797871473
"tracing: Adding new functions for kernel access to Ftrace instances".
Resolved by leaving the first test in the if statement as it was in v5.4.
kernel/trace/trace.c | 12 ++++++------
kernel/trace/trace.h | 2 +-
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 721947b9962d..f9c2bdbbd893 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5686,7 +5686,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->current_trace->ref) {
+ if (tr->trace_ref) {
ret = -EBUSY;
goto out;
}
@@ -5902,7 +5902,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
- tr->current_trace->ref++;
+ tr->trace_ref++;
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -5921,7 +5921,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- tr->current_trace->ref--;
+ tr->trace_ref--;
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
@@ -7230,7 +7230,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
- tr->current_trace->ref++;
+ tr->trace_ref++;
mutex_unlock(&trace_types_lock);
@@ -7331,7 +7331,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- iter->tr->current_trace->ref--;
+ iter->tr->trace_ref--;
__trace_array_put(iter->tr);
@@ -8470,7 +8470,7 @@ static int __remove_instance(struct trace_array *tr)
{
int i;
- if (tr->ref || (tr->current_trace && tr->current_trace->ref))
+ if (tr->ref || (tr->current_trace && tr->trace_ref))
return -EBUSY;
list_del(&tr->list);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a3c29d5fcc61..4055158c1dd2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -309,6 +309,7 @@ struct trace_array {
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
+ int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -498,7 +499,6 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
- int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
--
2.28.0
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 96b4833b6827a62c295b149213c68b559514c929 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin(a)gmail.com>
Date: Thu, 30 Jul 2020 16:23:18 +0800
Subject: [PATCH] tracing/hwlat: Honor the tracing_cpumask
In calculation of the cpu mask for the hwlat kernel thread, the wrong
cpu mask is used instead of the tracing_cpumask, this causes the
tracing/tracing_cpumask useless for hwlat tracer. Fixes it.
Link: https://lkml.kernel.org/r/20200730082318.42584-2-haokexin@gmail.com
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: 0330f7aa8ee6 ("tracing: Have hwlat trace migrate across tracing_cpumask CPUs")
Signed-off-by: Kevin Hao <haokexin(a)gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index ddb528a6cd51..17873e5d0353 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -283,6 +283,7 @@ static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
if (disable_migrate)
@@ -296,7 +297,7 @@ static void move_to_next_cpu(void)
goto disable;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(smp_processor_id(), current_mask);
put_online_cpus();
@@ -372,7 +373,7 @@ static int start_kthread(struct trace_array *tr)
/* Just pick the first CPU on first iteration */
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
put_online_cpus();
next_cpu = cpumask_first(current_mask);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c64be2bb1c6eb43c838b2c6d57b074078be208dd Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski(a)samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: [PATCH] drivers: add Contiguous Memory Allocator
The Contiguous Memory Allocator is a set of helper functions for DMA
mapping framework that improves allocations of contiguous memory chunks.
CMA grabs memory on system boot, marks it with MIGRATE_CMA migrate type
and gives back to the system. Kernel is allowed to allocate only movable
pages within CMA's managed memory so that it can be used for example for
page cache when DMA mapping do not use it. On
dma_alloc_from_contiguous() request such pages are migrated out of CMA
area to free required contiguous block and fulfill the request. This
allows to allocate large contiguous chunks of memory at any time
assuming that there is enough free memory available in the system.
This code is heavily based on earlier works by Michal Nazarewicz.
Signed-off-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park(a)samsung.com>
Signed-off-by: Michal Nazarewicz <mina86(a)mina86.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Tested-by: Rob Clark <rob.clark(a)linaro.org>
Tested-by: Ohad Ben-Cohen <ohad(a)wizery.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard(a)linaro.org>
Tested-by: Robert Nelson <robertcnelson(a)gmail.com>
Tested-by: Barry Song <Baohua.Song(a)csr.com>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..669e8bb52b94 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..0a3ffe46e567 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -142,6 +142,9 @@ config HAVE_ARCH_TRACEHOOK
config HAVE_DMA_ATTRS
bool
+config HAVE_DMA_CONTIGUOUS
+ bool
+
config USE_GENERIC_SMP_HELPERS
bool
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9aa618acfe97..9b21469482ae 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -192,4 +192,93 @@ config DMA_SHARED_BUFFER
APIs extension; the file's descriptor can then be passed on to other
driver.
+config CMA
+ bool "Contiguous Memory Allocator (EXPERIMENTAL)"
+ depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
+ select MIGRATION
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if CMA
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator.
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_ABSOLUTE
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 9
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+config CMA_AREAS
+ int "Maximum count of the CMA device-private areas"
+ default 7
+ help
+ CMA allows to create CMA areas for particular devices. This parameter
+ sets the maximum number of such device private CMA areas in the
+ system.
+
+ If unsure, leave the default value "7".
+
+endif
+
endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b6d1b9c4200c..5aa2d703d19f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \
attribute_container.o transport_class.o \
topology.o
obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
+obj-$(CONFIG_CMA) += dma-contiguous.o
obj-y += power/
obj-$(CONFIG_HAS_DMA) += dma-mapping.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
new file mode 100644
index 000000000000..78efb0306a44
--- /dev/null
+++ b/drivers/base/dma-contiguous.c
@@ -0,0 +1,401 @@
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-isolation.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/mm_types.h>
+#include <linux/dma-contiguous.h>
+
+#ifndef SZ_1M
+#define SZ_1M (1 << 20)
+#endif
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+};
+
+struct cma *dma_contiguous_default_area;
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is usefull mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
+static long size_cmdline = -1;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static unsigned long __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused unsigned long cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ unsigned long selected_size = 0;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ selected_size / SZ_1M);
+
+ dma_declare_contiguous(NULL, selected_size, 0, limit);
+ }
+};
+
+static DEFINE_MUTEX(cma_mutex);
+
+static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
+{
+ unsigned long pfn = base_pfn;
+ unsigned i = count >> pageblock_order;
+ struct zone *zone;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ return -EINVAL;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+ return 0;
+}
+
+static __init struct cma *cma_create_area(unsigned long base_pfn,
+ unsigned long count)
+{
+ int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+ struct cma *cma;
+ int ret = -ENOMEM;
+
+ pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count);
+
+ cma = kmalloc(sizeof *cma, GFP_KERNEL);
+ if (!cma)
+ return ERR_PTR(-ENOMEM);
+
+ cma->base_pfn = base_pfn;
+ cma->count = count;
+ cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+ if (!cma->bitmap)
+ goto no_mem;
+
+ ret = cma_activate_area(base_pfn, count);
+ if (ret)
+ goto error;
+
+ pr_debug("%s: returned %p\n", __func__, (void *)cma);
+ return cma;
+
+error:
+ kfree(cma->bitmap);
+no_mem:
+ kfree(cma);
+ return ERR_PTR(ret);
+}
+
+static struct cma_reserved {
+ phys_addr_t start;
+ unsigned long size;
+ struct device *dev;
+} cma_reserved[MAX_CMA_AREAS] __initdata;
+static unsigned cma_reserved_count __initdata;
+
+static int __init cma_init_reserved_areas(void)
+{
+ struct cma_reserved *r = cma_reserved;
+ unsigned i = cma_reserved_count;
+
+ pr_debug("%s()\n", __func__);
+
+ for (; i; --i, ++r) {
+ struct cma *cma;
+ cma = cma_create_area(PFN_DOWN(r->start),
+ r->size >> PAGE_SHIFT);
+ if (!IS_ERR(cma))
+ dev_set_cma_area(r->dev, cma);
+ }
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * dma_declare_contiguous() - reserve area for contiguous memory handling
+ * for particular device
+ * @dev: Pointer to device structure.
+ * @size: Size of the reserved memory.
+ * @base: Start address of the reserved memory (optional, 0 for any).
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory for specified device. It should be
+ * called by board specific code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+int __init dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ struct cma_reserved *r = &cma_reserved[cma_reserved_count];
+ unsigned long alignment;
+
+ pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__,
+ (unsigned long)size, (unsigned long)base,
+ (unsigned long)limit);
+
+ /* Sanity checks */
+ if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ /* Sanitise input arguments */
+ alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order);
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ /* Reserve memory */
+ if (base) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ base = -EBUSY;
+ goto err;
+ }
+ } else {
+ /*
+ * Use __memblock_alloc_base() since
+ * memblock_alloc_base() panic()s.
+ */
+ phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ if (!addr) {
+ base = -ENOMEM;
+ goto err;
+ } else if (addr + size > ~(unsigned long)0) {
+ memblock_free(addr, size);
+ base = -EINVAL;
+ goto err;
+ } else {
+ base = addr;
+ }
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ r->start = base;
+ r->size = size;
+ r->dev = dev;
+ cma_reserved_count++;
+ pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
+ (unsigned long)base);
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+err:
+ pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
+ return base;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific get_dev_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int align)
+{
+ unsigned long mask, pfn, pageno, start = 0;
+ struct cma *cma = dev_get_cma_area(dev);
+ int ret;
+
+ if (!cma || !cma->count)
+ return NULL;
+
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = (1 << align) - 1;
+
+ mutex_lock(&cma_mutex);
+
+ for (;;) {
+ pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
+ start, count, mask);
+ if (pageno >= cma->count) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pfn = cma->base_pfn + pageno;
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ if (ret == 0) {
+ bitmap_set(cma->bitmap, pageno, count);
+ break;
+ } else if (ret != -EBUSY) {
+ goto error;
+ }
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = pageno + mask + 1;
+ }
+
+ mutex_unlock(&cma_mutex);
+
+ pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn));
+ return pfn_to_page(pfn);
+error:
+ mutex_unlock(&cma_mutex);
+ return NULL;
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ struct cma *cma = dev_get_cma_area(dev);
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ mutex_lock(&cma_mutex);
+ bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count);
+ free_contig_range(pfn, count);
+ mutex_unlock(&cma_mutex);
+
+ return true;
+}
diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h
new file mode 100644
index 000000000000..c544356b374b
--- /dev/null
+++ b/include/asm-generic/dma-contiguous.h
@@ -0,0 +1,28 @@
+#ifndef ASM_DMA_CONTIGUOUS_H
+#define ASM_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CMA
+
+#include <linux/device.h>
+#include <linux/dma-contiguous.h>
+
+static inline struct cma *dev_get_cma_area(struct device *dev)
+{
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+ return dma_contiguous_default_area;
+}
+
+static inline void dev_set_cma_area(struct device *dev, struct cma *cma)
+{
+ if (dev)
+ dev->cma_area = cma;
+ if (!dev || !dma_contiguous_default_area)
+ dma_contiguous_default_area = cma;
+}
+
+#endif
+#endif
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 5ad17cccdd71..e3399290436e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -661,6 +661,10 @@ struct device {
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
+#ifdef CONFIG_CMA
+ struct cma *cma_area; /* contiguous memory area for dma
+ allocations */
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
new file mode 100644
index 000000000000..2f303e4b7ed3
--- /dev/null
+++ b/include/linux/dma-contiguous.h
@@ -0,0 +1,110 @@
+#ifndef __LINUX_CMA_H
+#define __LINUX_CMA_H
+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+/*
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ *
+ * Driver usage
+ *
+ * CMA should not be used by the device drivers directly. It is
+ * only a helper framework for dma-mapping subsystem.
+ *
+ * For more information, see kernel-docs in drivers/base/dma-contiguous.c
+ */
+
+#ifdef __KERNEL__
+
+struct cma;
+struct page;
+struct device;
+
+#ifdef CONFIG_CMA
+
+/*
+ * There is always at least global CMA area and a few optional device
+ * private areas configured in kernel .config.
+ */
+#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+
+extern struct cma *dma_contiguous_default_area;
+
+void dma_contiguous_reserve(phys_addr_t addr_limit);
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit);
+
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order);
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count);
+
+#else
+
+#define MAX_CMA_AREAS (0)
+
+static inline void dma_contiguous_reserve(phys_addr_t limit) { }
+
+static inline
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ return -ENOSYS;
+}
+
+static inline
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order)
+{
+ return NULL;
+}
+
+static inline
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return false;
+}
+
+#endif
+
+#endif
+
+#endif
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c64be2bb1c6eb43c838b2c6d57b074078be208dd Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski(a)samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: [PATCH] drivers: add Contiguous Memory Allocator
The Contiguous Memory Allocator is a set of helper functions for DMA
mapping framework that improves allocations of contiguous memory chunks.
CMA grabs memory on system boot, marks it with MIGRATE_CMA migrate type
and gives back to the system. Kernel is allowed to allocate only movable
pages within CMA's managed memory so that it can be used for example for
page cache when DMA mapping do not use it. On
dma_alloc_from_contiguous() request such pages are migrated out of CMA
area to free required contiguous block and fulfill the request. This
allows to allocate large contiguous chunks of memory at any time
assuming that there is enough free memory available in the system.
This code is heavily based on earlier works by Michal Nazarewicz.
Signed-off-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park(a)samsung.com>
Signed-off-by: Michal Nazarewicz <mina86(a)mina86.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Tested-by: Rob Clark <rob.clark(a)linaro.org>
Tested-by: Ohad Ben-Cohen <ohad(a)wizery.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard(a)linaro.org>
Tested-by: Robert Nelson <robertcnelson(a)gmail.com>
Tested-by: Barry Song <Baohua.Song(a)csr.com>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..669e8bb52b94 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..0a3ffe46e567 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -142,6 +142,9 @@ config HAVE_ARCH_TRACEHOOK
config HAVE_DMA_ATTRS
bool
+config HAVE_DMA_CONTIGUOUS
+ bool
+
config USE_GENERIC_SMP_HELPERS
bool
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9aa618acfe97..9b21469482ae 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -192,4 +192,93 @@ config DMA_SHARED_BUFFER
APIs extension; the file's descriptor can then be passed on to other
driver.
+config CMA
+ bool "Contiguous Memory Allocator (EXPERIMENTAL)"
+ depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
+ select MIGRATION
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if CMA
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator.
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_ABSOLUTE
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 9
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+config CMA_AREAS
+ int "Maximum count of the CMA device-private areas"
+ default 7
+ help
+ CMA allows to create CMA areas for particular devices. This parameter
+ sets the maximum number of such device private CMA areas in the
+ system.
+
+ If unsure, leave the default value "7".
+
+endif
+
endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b6d1b9c4200c..5aa2d703d19f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \
attribute_container.o transport_class.o \
topology.o
obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
+obj-$(CONFIG_CMA) += dma-contiguous.o
obj-y += power/
obj-$(CONFIG_HAS_DMA) += dma-mapping.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
new file mode 100644
index 000000000000..78efb0306a44
--- /dev/null
+++ b/drivers/base/dma-contiguous.c
@@ -0,0 +1,401 @@
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-isolation.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/mm_types.h>
+#include <linux/dma-contiguous.h>
+
+#ifndef SZ_1M
+#define SZ_1M (1 << 20)
+#endif
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+};
+
+struct cma *dma_contiguous_default_area;
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is usefull mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
+static long size_cmdline = -1;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static unsigned long __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused unsigned long cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ unsigned long selected_size = 0;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ selected_size / SZ_1M);
+
+ dma_declare_contiguous(NULL, selected_size, 0, limit);
+ }
+};
+
+static DEFINE_MUTEX(cma_mutex);
+
+static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
+{
+ unsigned long pfn = base_pfn;
+ unsigned i = count >> pageblock_order;
+ struct zone *zone;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ return -EINVAL;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+ return 0;
+}
+
+static __init struct cma *cma_create_area(unsigned long base_pfn,
+ unsigned long count)
+{
+ int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+ struct cma *cma;
+ int ret = -ENOMEM;
+
+ pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count);
+
+ cma = kmalloc(sizeof *cma, GFP_KERNEL);
+ if (!cma)
+ return ERR_PTR(-ENOMEM);
+
+ cma->base_pfn = base_pfn;
+ cma->count = count;
+ cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+ if (!cma->bitmap)
+ goto no_mem;
+
+ ret = cma_activate_area(base_pfn, count);
+ if (ret)
+ goto error;
+
+ pr_debug("%s: returned %p\n", __func__, (void *)cma);
+ return cma;
+
+error:
+ kfree(cma->bitmap);
+no_mem:
+ kfree(cma);
+ return ERR_PTR(ret);
+}
+
+static struct cma_reserved {
+ phys_addr_t start;
+ unsigned long size;
+ struct device *dev;
+} cma_reserved[MAX_CMA_AREAS] __initdata;
+static unsigned cma_reserved_count __initdata;
+
+static int __init cma_init_reserved_areas(void)
+{
+ struct cma_reserved *r = cma_reserved;
+ unsigned i = cma_reserved_count;
+
+ pr_debug("%s()\n", __func__);
+
+ for (; i; --i, ++r) {
+ struct cma *cma;
+ cma = cma_create_area(PFN_DOWN(r->start),
+ r->size >> PAGE_SHIFT);
+ if (!IS_ERR(cma))
+ dev_set_cma_area(r->dev, cma);
+ }
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * dma_declare_contiguous() - reserve area for contiguous memory handling
+ * for particular device
+ * @dev: Pointer to device structure.
+ * @size: Size of the reserved memory.
+ * @base: Start address of the reserved memory (optional, 0 for any).
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory for specified device. It should be
+ * called by board specific code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+int __init dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ struct cma_reserved *r = &cma_reserved[cma_reserved_count];
+ unsigned long alignment;
+
+ pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__,
+ (unsigned long)size, (unsigned long)base,
+ (unsigned long)limit);
+
+ /* Sanity checks */
+ if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ /* Sanitise input arguments */
+ alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order);
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ /* Reserve memory */
+ if (base) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ base = -EBUSY;
+ goto err;
+ }
+ } else {
+ /*
+ * Use __memblock_alloc_base() since
+ * memblock_alloc_base() panic()s.
+ */
+ phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ if (!addr) {
+ base = -ENOMEM;
+ goto err;
+ } else if (addr + size > ~(unsigned long)0) {
+ memblock_free(addr, size);
+ base = -EINVAL;
+ goto err;
+ } else {
+ base = addr;
+ }
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ r->start = base;
+ r->size = size;
+ r->dev = dev;
+ cma_reserved_count++;
+ pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
+ (unsigned long)base);
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+err:
+ pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
+ return base;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific get_dev_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int align)
+{
+ unsigned long mask, pfn, pageno, start = 0;
+ struct cma *cma = dev_get_cma_area(dev);
+ int ret;
+
+ if (!cma || !cma->count)
+ return NULL;
+
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = (1 << align) - 1;
+
+ mutex_lock(&cma_mutex);
+
+ for (;;) {
+ pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
+ start, count, mask);
+ if (pageno >= cma->count) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pfn = cma->base_pfn + pageno;
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ if (ret == 0) {
+ bitmap_set(cma->bitmap, pageno, count);
+ break;
+ } else if (ret != -EBUSY) {
+ goto error;
+ }
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = pageno + mask + 1;
+ }
+
+ mutex_unlock(&cma_mutex);
+
+ pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn));
+ return pfn_to_page(pfn);
+error:
+ mutex_unlock(&cma_mutex);
+ return NULL;
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ struct cma *cma = dev_get_cma_area(dev);
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ mutex_lock(&cma_mutex);
+ bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count);
+ free_contig_range(pfn, count);
+ mutex_unlock(&cma_mutex);
+
+ return true;
+}
diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h
new file mode 100644
index 000000000000..c544356b374b
--- /dev/null
+++ b/include/asm-generic/dma-contiguous.h
@@ -0,0 +1,28 @@
+#ifndef ASM_DMA_CONTIGUOUS_H
+#define ASM_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CMA
+
+#include <linux/device.h>
+#include <linux/dma-contiguous.h>
+
+static inline struct cma *dev_get_cma_area(struct device *dev)
+{
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+ return dma_contiguous_default_area;
+}
+
+static inline void dev_set_cma_area(struct device *dev, struct cma *cma)
+{
+ if (dev)
+ dev->cma_area = cma;
+ if (!dev || !dma_contiguous_default_area)
+ dma_contiguous_default_area = cma;
+}
+
+#endif
+#endif
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 5ad17cccdd71..e3399290436e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -661,6 +661,10 @@ struct device {
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
+#ifdef CONFIG_CMA
+ struct cma *cma_area; /* contiguous memory area for dma
+ allocations */
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
new file mode 100644
index 000000000000..2f303e4b7ed3
--- /dev/null
+++ b/include/linux/dma-contiguous.h
@@ -0,0 +1,110 @@
+#ifndef __LINUX_CMA_H
+#define __LINUX_CMA_H
+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+/*
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ *
+ * Driver usage
+ *
+ * CMA should not be used by the device drivers directly. It is
+ * only a helper framework for dma-mapping subsystem.
+ *
+ * For more information, see kernel-docs in drivers/base/dma-contiguous.c
+ */
+
+#ifdef __KERNEL__
+
+struct cma;
+struct page;
+struct device;
+
+#ifdef CONFIG_CMA
+
+/*
+ * There is always at least global CMA area and a few optional device
+ * private areas configured in kernel .config.
+ */
+#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+
+extern struct cma *dma_contiguous_default_area;
+
+void dma_contiguous_reserve(phys_addr_t addr_limit);
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit);
+
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order);
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count);
+
+#else
+
+#define MAX_CMA_AREAS (0)
+
+static inline void dma_contiguous_reserve(phys_addr_t limit) { }
+
+static inline
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ return -ENOSYS;
+}
+
+static inline
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order)
+{
+ return NULL;
+}
+
+static inline
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return false;
+}
+
+#endif
+
+#endif
+
+#endif
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c64be2bb1c6eb43c838b2c6d57b074078be208dd Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski(a)samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: [PATCH] drivers: add Contiguous Memory Allocator
The Contiguous Memory Allocator is a set of helper functions for DMA
mapping framework that improves allocations of contiguous memory chunks.
CMA grabs memory on system boot, marks it with MIGRATE_CMA migrate type
and gives back to the system. Kernel is allowed to allocate only movable
pages within CMA's managed memory so that it can be used for example for
page cache when DMA mapping do not use it. On
dma_alloc_from_contiguous() request such pages are migrated out of CMA
area to free required contiguous block and fulfill the request. This
allows to allocate large contiguous chunks of memory at any time
assuming that there is enough free memory available in the system.
This code is heavily based on earlier works by Michal Nazarewicz.
Signed-off-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park(a)samsung.com>
Signed-off-by: Michal Nazarewicz <mina86(a)mina86.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Tested-by: Rob Clark <rob.clark(a)linaro.org>
Tested-by: Ohad Ben-Cohen <ohad(a)wizery.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard(a)linaro.org>
Tested-by: Robert Nelson <robertcnelson(a)gmail.com>
Tested-by: Barry Song <Baohua.Song(a)csr.com>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..669e8bb52b94 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..0a3ffe46e567 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -142,6 +142,9 @@ config HAVE_ARCH_TRACEHOOK
config HAVE_DMA_ATTRS
bool
+config HAVE_DMA_CONTIGUOUS
+ bool
+
config USE_GENERIC_SMP_HELPERS
bool
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9aa618acfe97..9b21469482ae 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -192,4 +192,93 @@ config DMA_SHARED_BUFFER
APIs extension; the file's descriptor can then be passed on to other
driver.
+config CMA
+ bool "Contiguous Memory Allocator (EXPERIMENTAL)"
+ depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
+ select MIGRATION
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if CMA
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator.
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_ABSOLUTE
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 9
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+config CMA_AREAS
+ int "Maximum count of the CMA device-private areas"
+ default 7
+ help
+ CMA allows to create CMA areas for particular devices. This parameter
+ sets the maximum number of such device private CMA areas in the
+ system.
+
+ If unsure, leave the default value "7".
+
+endif
+
endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b6d1b9c4200c..5aa2d703d19f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \
attribute_container.o transport_class.o \
topology.o
obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
+obj-$(CONFIG_CMA) += dma-contiguous.o
obj-y += power/
obj-$(CONFIG_HAS_DMA) += dma-mapping.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
new file mode 100644
index 000000000000..78efb0306a44
--- /dev/null
+++ b/drivers/base/dma-contiguous.c
@@ -0,0 +1,401 @@
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-isolation.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/mm_types.h>
+#include <linux/dma-contiguous.h>
+
+#ifndef SZ_1M
+#define SZ_1M (1 << 20)
+#endif
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+};
+
+struct cma *dma_contiguous_default_area;
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is usefull mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
+static long size_cmdline = -1;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static unsigned long __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused unsigned long cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ unsigned long selected_size = 0;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ selected_size / SZ_1M);
+
+ dma_declare_contiguous(NULL, selected_size, 0, limit);
+ }
+};
+
+static DEFINE_MUTEX(cma_mutex);
+
+static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
+{
+ unsigned long pfn = base_pfn;
+ unsigned i = count >> pageblock_order;
+ struct zone *zone;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ return -EINVAL;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+ return 0;
+}
+
+static __init struct cma *cma_create_area(unsigned long base_pfn,
+ unsigned long count)
+{
+ int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+ struct cma *cma;
+ int ret = -ENOMEM;
+
+ pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count);
+
+ cma = kmalloc(sizeof *cma, GFP_KERNEL);
+ if (!cma)
+ return ERR_PTR(-ENOMEM);
+
+ cma->base_pfn = base_pfn;
+ cma->count = count;
+ cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+ if (!cma->bitmap)
+ goto no_mem;
+
+ ret = cma_activate_area(base_pfn, count);
+ if (ret)
+ goto error;
+
+ pr_debug("%s: returned %p\n", __func__, (void *)cma);
+ return cma;
+
+error:
+ kfree(cma->bitmap);
+no_mem:
+ kfree(cma);
+ return ERR_PTR(ret);
+}
+
+static struct cma_reserved {
+ phys_addr_t start;
+ unsigned long size;
+ struct device *dev;
+} cma_reserved[MAX_CMA_AREAS] __initdata;
+static unsigned cma_reserved_count __initdata;
+
+static int __init cma_init_reserved_areas(void)
+{
+ struct cma_reserved *r = cma_reserved;
+ unsigned i = cma_reserved_count;
+
+ pr_debug("%s()\n", __func__);
+
+ for (; i; --i, ++r) {
+ struct cma *cma;
+ cma = cma_create_area(PFN_DOWN(r->start),
+ r->size >> PAGE_SHIFT);
+ if (!IS_ERR(cma))
+ dev_set_cma_area(r->dev, cma);
+ }
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * dma_declare_contiguous() - reserve area for contiguous memory handling
+ * for particular device
+ * @dev: Pointer to device structure.
+ * @size: Size of the reserved memory.
+ * @base: Start address of the reserved memory (optional, 0 for any).
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory for specified device. It should be
+ * called by board specific code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+int __init dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ struct cma_reserved *r = &cma_reserved[cma_reserved_count];
+ unsigned long alignment;
+
+ pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__,
+ (unsigned long)size, (unsigned long)base,
+ (unsigned long)limit);
+
+ /* Sanity checks */
+ if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ /* Sanitise input arguments */
+ alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order);
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ /* Reserve memory */
+ if (base) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ base = -EBUSY;
+ goto err;
+ }
+ } else {
+ /*
+ * Use __memblock_alloc_base() since
+ * memblock_alloc_base() panic()s.
+ */
+ phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ if (!addr) {
+ base = -ENOMEM;
+ goto err;
+ } else if (addr + size > ~(unsigned long)0) {
+ memblock_free(addr, size);
+ base = -EINVAL;
+ goto err;
+ } else {
+ base = addr;
+ }
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ r->start = base;
+ r->size = size;
+ r->dev = dev;
+ cma_reserved_count++;
+ pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
+ (unsigned long)base);
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+err:
+ pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
+ return base;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific get_dev_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int align)
+{
+ unsigned long mask, pfn, pageno, start = 0;
+ struct cma *cma = dev_get_cma_area(dev);
+ int ret;
+
+ if (!cma || !cma->count)
+ return NULL;
+
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = (1 << align) - 1;
+
+ mutex_lock(&cma_mutex);
+
+ for (;;) {
+ pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
+ start, count, mask);
+ if (pageno >= cma->count) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pfn = cma->base_pfn + pageno;
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ if (ret == 0) {
+ bitmap_set(cma->bitmap, pageno, count);
+ break;
+ } else if (ret != -EBUSY) {
+ goto error;
+ }
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = pageno + mask + 1;
+ }
+
+ mutex_unlock(&cma_mutex);
+
+ pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn));
+ return pfn_to_page(pfn);
+error:
+ mutex_unlock(&cma_mutex);
+ return NULL;
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ struct cma *cma = dev_get_cma_area(dev);
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ mutex_lock(&cma_mutex);
+ bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count);
+ free_contig_range(pfn, count);
+ mutex_unlock(&cma_mutex);
+
+ return true;
+}
diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h
new file mode 100644
index 000000000000..c544356b374b
--- /dev/null
+++ b/include/asm-generic/dma-contiguous.h
@@ -0,0 +1,28 @@
+#ifndef ASM_DMA_CONTIGUOUS_H
+#define ASM_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CMA
+
+#include <linux/device.h>
+#include <linux/dma-contiguous.h>
+
+static inline struct cma *dev_get_cma_area(struct device *dev)
+{
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+ return dma_contiguous_default_area;
+}
+
+static inline void dev_set_cma_area(struct device *dev, struct cma *cma)
+{
+ if (dev)
+ dev->cma_area = cma;
+ if (!dev || !dma_contiguous_default_area)
+ dma_contiguous_default_area = cma;
+}
+
+#endif
+#endif
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 5ad17cccdd71..e3399290436e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -661,6 +661,10 @@ struct device {
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
+#ifdef CONFIG_CMA
+ struct cma *cma_area; /* contiguous memory area for dma
+ allocations */
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
new file mode 100644
index 000000000000..2f303e4b7ed3
--- /dev/null
+++ b/include/linux/dma-contiguous.h
@@ -0,0 +1,110 @@
+#ifndef __LINUX_CMA_H
+#define __LINUX_CMA_H
+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+/*
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ *
+ * Driver usage
+ *
+ * CMA should not be used by the device drivers directly. It is
+ * only a helper framework for dma-mapping subsystem.
+ *
+ * For more information, see kernel-docs in drivers/base/dma-contiguous.c
+ */
+
+#ifdef __KERNEL__
+
+struct cma;
+struct page;
+struct device;
+
+#ifdef CONFIG_CMA
+
+/*
+ * There is always at least global CMA area and a few optional device
+ * private areas configured in kernel .config.
+ */
+#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+
+extern struct cma *dma_contiguous_default_area;
+
+void dma_contiguous_reserve(phys_addr_t addr_limit);
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit);
+
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order);
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count);
+
+#else
+
+#define MAX_CMA_AREAS (0)
+
+static inline void dma_contiguous_reserve(phys_addr_t limit) { }
+
+static inline
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ return -ENOSYS;
+}
+
+static inline
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order)
+{
+ return NULL;
+}
+
+static inline
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return false;
+}
+
+#endif
+
+#endif
+
+#endif
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c64be2bb1c6eb43c838b2c6d57b074078be208dd Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski(a)samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: [PATCH] drivers: add Contiguous Memory Allocator
The Contiguous Memory Allocator is a set of helper functions for DMA
mapping framework that improves allocations of contiguous memory chunks.
CMA grabs memory on system boot, marks it with MIGRATE_CMA migrate type
and gives back to the system. Kernel is allowed to allocate only movable
pages within CMA's managed memory so that it can be used for example for
page cache when DMA mapping do not use it. On
dma_alloc_from_contiguous() request such pages are migrated out of CMA
area to free required contiguous block and fulfill the request. This
allows to allocate large contiguous chunks of memory at any time
assuming that there is enough free memory available in the system.
This code is heavily based on earlier works by Michal Nazarewicz.
Signed-off-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park(a)samsung.com>
Signed-off-by: Michal Nazarewicz <mina86(a)mina86.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Tested-by: Rob Clark <rob.clark(a)linaro.org>
Tested-by: Ohad Ben-Cohen <ohad(a)wizery.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard(a)linaro.org>
Tested-by: Robert Nelson <robertcnelson(a)gmail.com>
Tested-by: Barry Song <Baohua.Song(a)csr.com>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..669e8bb52b94 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..0a3ffe46e567 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -142,6 +142,9 @@ config HAVE_ARCH_TRACEHOOK
config HAVE_DMA_ATTRS
bool
+config HAVE_DMA_CONTIGUOUS
+ bool
+
config USE_GENERIC_SMP_HELPERS
bool
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9aa618acfe97..9b21469482ae 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -192,4 +192,93 @@ config DMA_SHARED_BUFFER
APIs extension; the file's descriptor can then be passed on to other
driver.
+config CMA
+ bool "Contiguous Memory Allocator (EXPERIMENTAL)"
+ depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
+ select MIGRATION
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if CMA
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator.
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_ABSOLUTE
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 9
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+config CMA_AREAS
+ int "Maximum count of the CMA device-private areas"
+ default 7
+ help
+ CMA allows to create CMA areas for particular devices. This parameter
+ sets the maximum number of such device private CMA areas in the
+ system.
+
+ If unsure, leave the default value "7".
+
+endif
+
endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b6d1b9c4200c..5aa2d703d19f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \
attribute_container.o transport_class.o \
topology.o
obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
+obj-$(CONFIG_CMA) += dma-contiguous.o
obj-y += power/
obj-$(CONFIG_HAS_DMA) += dma-mapping.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
new file mode 100644
index 000000000000..78efb0306a44
--- /dev/null
+++ b/drivers/base/dma-contiguous.c
@@ -0,0 +1,401 @@
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-isolation.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/mm_types.h>
+#include <linux/dma-contiguous.h>
+
+#ifndef SZ_1M
+#define SZ_1M (1 << 20)
+#endif
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+};
+
+struct cma *dma_contiguous_default_area;
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is usefull mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
+static long size_cmdline = -1;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static unsigned long __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused unsigned long cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ unsigned long selected_size = 0;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ selected_size / SZ_1M);
+
+ dma_declare_contiguous(NULL, selected_size, 0, limit);
+ }
+};
+
+static DEFINE_MUTEX(cma_mutex);
+
+static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
+{
+ unsigned long pfn = base_pfn;
+ unsigned i = count >> pageblock_order;
+ struct zone *zone;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ return -EINVAL;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+ return 0;
+}
+
+static __init struct cma *cma_create_area(unsigned long base_pfn,
+ unsigned long count)
+{
+ int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+ struct cma *cma;
+ int ret = -ENOMEM;
+
+ pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count);
+
+ cma = kmalloc(sizeof *cma, GFP_KERNEL);
+ if (!cma)
+ return ERR_PTR(-ENOMEM);
+
+ cma->base_pfn = base_pfn;
+ cma->count = count;
+ cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+ if (!cma->bitmap)
+ goto no_mem;
+
+ ret = cma_activate_area(base_pfn, count);
+ if (ret)
+ goto error;
+
+ pr_debug("%s: returned %p\n", __func__, (void *)cma);
+ return cma;
+
+error:
+ kfree(cma->bitmap);
+no_mem:
+ kfree(cma);
+ return ERR_PTR(ret);
+}
+
+static struct cma_reserved {
+ phys_addr_t start;
+ unsigned long size;
+ struct device *dev;
+} cma_reserved[MAX_CMA_AREAS] __initdata;
+static unsigned cma_reserved_count __initdata;
+
+static int __init cma_init_reserved_areas(void)
+{
+ struct cma_reserved *r = cma_reserved;
+ unsigned i = cma_reserved_count;
+
+ pr_debug("%s()\n", __func__);
+
+ for (; i; --i, ++r) {
+ struct cma *cma;
+ cma = cma_create_area(PFN_DOWN(r->start),
+ r->size >> PAGE_SHIFT);
+ if (!IS_ERR(cma))
+ dev_set_cma_area(r->dev, cma);
+ }
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * dma_declare_contiguous() - reserve area for contiguous memory handling
+ * for particular device
+ * @dev: Pointer to device structure.
+ * @size: Size of the reserved memory.
+ * @base: Start address of the reserved memory (optional, 0 for any).
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory for specified device. It should be
+ * called by board specific code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+int __init dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ struct cma_reserved *r = &cma_reserved[cma_reserved_count];
+ unsigned long alignment;
+
+ pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__,
+ (unsigned long)size, (unsigned long)base,
+ (unsigned long)limit);
+
+ /* Sanity checks */
+ if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ /* Sanitise input arguments */
+ alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order);
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ /* Reserve memory */
+ if (base) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ base = -EBUSY;
+ goto err;
+ }
+ } else {
+ /*
+ * Use __memblock_alloc_base() since
+ * memblock_alloc_base() panic()s.
+ */
+ phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ if (!addr) {
+ base = -ENOMEM;
+ goto err;
+ } else if (addr + size > ~(unsigned long)0) {
+ memblock_free(addr, size);
+ base = -EINVAL;
+ goto err;
+ } else {
+ base = addr;
+ }
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ r->start = base;
+ r->size = size;
+ r->dev = dev;
+ cma_reserved_count++;
+ pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
+ (unsigned long)base);
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+err:
+ pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
+ return base;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific get_dev_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int align)
+{
+ unsigned long mask, pfn, pageno, start = 0;
+ struct cma *cma = dev_get_cma_area(dev);
+ int ret;
+
+ if (!cma || !cma->count)
+ return NULL;
+
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = (1 << align) - 1;
+
+ mutex_lock(&cma_mutex);
+
+ for (;;) {
+ pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
+ start, count, mask);
+ if (pageno >= cma->count) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pfn = cma->base_pfn + pageno;
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ if (ret == 0) {
+ bitmap_set(cma->bitmap, pageno, count);
+ break;
+ } else if (ret != -EBUSY) {
+ goto error;
+ }
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = pageno + mask + 1;
+ }
+
+ mutex_unlock(&cma_mutex);
+
+ pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn));
+ return pfn_to_page(pfn);
+error:
+ mutex_unlock(&cma_mutex);
+ return NULL;
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ struct cma *cma = dev_get_cma_area(dev);
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ mutex_lock(&cma_mutex);
+ bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count);
+ free_contig_range(pfn, count);
+ mutex_unlock(&cma_mutex);
+
+ return true;
+}
diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h
new file mode 100644
index 000000000000..c544356b374b
--- /dev/null
+++ b/include/asm-generic/dma-contiguous.h
@@ -0,0 +1,28 @@
+#ifndef ASM_DMA_CONTIGUOUS_H
+#define ASM_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CMA
+
+#include <linux/device.h>
+#include <linux/dma-contiguous.h>
+
+static inline struct cma *dev_get_cma_area(struct device *dev)
+{
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+ return dma_contiguous_default_area;
+}
+
+static inline void dev_set_cma_area(struct device *dev, struct cma *cma)
+{
+ if (dev)
+ dev->cma_area = cma;
+ if (!dev || !dma_contiguous_default_area)
+ dma_contiguous_default_area = cma;
+}
+
+#endif
+#endif
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 5ad17cccdd71..e3399290436e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -661,6 +661,10 @@ struct device {
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
+#ifdef CONFIG_CMA
+ struct cma *cma_area; /* contiguous memory area for dma
+ allocations */
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
new file mode 100644
index 000000000000..2f303e4b7ed3
--- /dev/null
+++ b/include/linux/dma-contiguous.h
@@ -0,0 +1,110 @@
+#ifndef __LINUX_CMA_H
+#define __LINUX_CMA_H
+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+/*
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ *
+ * Driver usage
+ *
+ * CMA should not be used by the device drivers directly. It is
+ * only a helper framework for dma-mapping subsystem.
+ *
+ * For more information, see kernel-docs in drivers/base/dma-contiguous.c
+ */
+
+#ifdef __KERNEL__
+
+struct cma;
+struct page;
+struct device;
+
+#ifdef CONFIG_CMA
+
+/*
+ * There is always at least global CMA area and a few optional device
+ * private areas configured in kernel .config.
+ */
+#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+
+extern struct cma *dma_contiguous_default_area;
+
+void dma_contiguous_reserve(phys_addr_t addr_limit);
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit);
+
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order);
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count);
+
+#else
+
+#define MAX_CMA_AREAS (0)
+
+static inline void dma_contiguous_reserve(phys_addr_t limit) { }
+
+static inline
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ return -ENOSYS;
+}
+
+static inline
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order)
+{
+ return NULL;
+}
+
+static inline
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return false;
+}
+
+#endif
+
+#endif
+
+#endif
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c64be2bb1c6eb43c838b2c6d57b074078be208dd Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski(a)samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: [PATCH] drivers: add Contiguous Memory Allocator
The Contiguous Memory Allocator is a set of helper functions for DMA
mapping framework that improves allocations of contiguous memory chunks.
CMA grabs memory on system boot, marks it with MIGRATE_CMA migrate type
and gives back to the system. Kernel is allowed to allocate only movable
pages within CMA's managed memory so that it can be used for example for
page cache when DMA mapping do not use it. On
dma_alloc_from_contiguous() request such pages are migrated out of CMA
area to free required contiguous block and fulfill the request. This
allows to allocate large contiguous chunks of memory at any time
assuming that there is enough free memory available in the system.
This code is heavily based on earlier works by Michal Nazarewicz.
Signed-off-by: Marek Szyprowski <m.szyprowski(a)samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park(a)samsung.com>
Signed-off-by: Michal Nazarewicz <mina86(a)mina86.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Tested-by: Rob Clark <rob.clark(a)linaro.org>
Tested-by: Ohad Ben-Cohen <ohad(a)wizery.com>
Tested-by: Benjamin Gaignard <benjamin.gaignard(a)linaro.org>
Tested-by: Robert Nelson <robertcnelson(a)gmail.com>
Tested-by: Barry Song <Baohua.Song(a)csr.com>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..669e8bb52b94 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -508,6 +508,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Also note the kernel might malfunction if you disable
some critical bits.
+ cma=nn[MG] [ARM,KNL]
+ Sets the size of kernel global memory area for contiguous
+ memory allocations. For more information, see
+ include/linux/dma-contiguous.h
+
cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..0a3ffe46e567 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -142,6 +142,9 @@ config HAVE_ARCH_TRACEHOOK
config HAVE_DMA_ATTRS
bool
+config HAVE_DMA_CONTIGUOUS
+ bool
+
config USE_GENERIC_SMP_HELPERS
bool
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 9aa618acfe97..9b21469482ae 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -192,4 +192,93 @@ config DMA_SHARED_BUFFER
APIs extension; the file's descriptor can then be passed on to other
driver.
+config CMA
+ bool "Contiguous Memory Allocator (EXPERIMENTAL)"
+ depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
+ select MIGRATION
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if CMA
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator.
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_ABSOLUTE
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 9
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+config CMA_AREAS
+ int "Maximum count of the CMA device-private areas"
+ default 7
+ help
+ CMA allows to create CMA areas for particular devices. This parameter
+ sets the maximum number of such device private CMA areas in the
+ system.
+
+ If unsure, leave the default value "7".
+
+endif
+
endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b6d1b9c4200c..5aa2d703d19f 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,6 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \
attribute_container.o transport_class.o \
topology.o
obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
+obj-$(CONFIG_CMA) += dma-contiguous.o
obj-y += power/
obj-$(CONFIG_HAS_DMA) += dma-mapping.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
new file mode 100644
index 000000000000..78efb0306a44
--- /dev/null
+++ b/drivers/base/dma-contiguous.c
@@ -0,0 +1,401 @@
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-isolation.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/mm_types.h>
+#include <linux/dma-contiguous.h>
+
+#ifndef SZ_1M
+#define SZ_1M (1 << 20)
+#endif
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+};
+
+struct cma *dma_contiguous_default_area;
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is usefull mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
+static long size_cmdline = -1;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static unsigned long __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused unsigned long cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ unsigned long selected_size = 0;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ selected_size / SZ_1M);
+
+ dma_declare_contiguous(NULL, selected_size, 0, limit);
+ }
+};
+
+static DEFINE_MUTEX(cma_mutex);
+
+static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
+{
+ unsigned long pfn = base_pfn;
+ unsigned i = count >> pageblock_order;
+ struct zone *zone;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ return -EINVAL;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+ return 0;
+}
+
+static __init struct cma *cma_create_area(unsigned long base_pfn,
+ unsigned long count)
+{
+ int bitmap_size = BITS_TO_LONGS(count) * sizeof(long);
+ struct cma *cma;
+ int ret = -ENOMEM;
+
+ pr_debug("%s(base %08lx, count %lx)\n", __func__, base_pfn, count);
+
+ cma = kmalloc(sizeof *cma, GFP_KERNEL);
+ if (!cma)
+ return ERR_PTR(-ENOMEM);
+
+ cma->base_pfn = base_pfn;
+ cma->count = count;
+ cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+ if (!cma->bitmap)
+ goto no_mem;
+
+ ret = cma_activate_area(base_pfn, count);
+ if (ret)
+ goto error;
+
+ pr_debug("%s: returned %p\n", __func__, (void *)cma);
+ return cma;
+
+error:
+ kfree(cma->bitmap);
+no_mem:
+ kfree(cma);
+ return ERR_PTR(ret);
+}
+
+static struct cma_reserved {
+ phys_addr_t start;
+ unsigned long size;
+ struct device *dev;
+} cma_reserved[MAX_CMA_AREAS] __initdata;
+static unsigned cma_reserved_count __initdata;
+
+static int __init cma_init_reserved_areas(void)
+{
+ struct cma_reserved *r = cma_reserved;
+ unsigned i = cma_reserved_count;
+
+ pr_debug("%s()\n", __func__);
+
+ for (; i; --i, ++r) {
+ struct cma *cma;
+ cma = cma_create_area(PFN_DOWN(r->start),
+ r->size >> PAGE_SHIFT);
+ if (!IS_ERR(cma))
+ dev_set_cma_area(r->dev, cma);
+ }
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * dma_declare_contiguous() - reserve area for contiguous memory handling
+ * for particular device
+ * @dev: Pointer to device structure.
+ * @size: Size of the reserved memory.
+ * @base: Start address of the reserved memory (optional, 0 for any).
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory for specified device. It should be
+ * called by board specific code when early allocator (memblock or bootmem)
+ * is still activate.
+ */
+int __init dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ struct cma_reserved *r = &cma_reserved[cma_reserved_count];
+ unsigned long alignment;
+
+ pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__,
+ (unsigned long)size, (unsigned long)base,
+ (unsigned long)limit);
+
+ /* Sanity checks */
+ if (cma_reserved_count == ARRAY_SIZE(cma_reserved)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ /* Sanitise input arguments */
+ alignment = PAGE_SIZE << max(MAX_ORDER, pageblock_order);
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ /* Reserve memory */
+ if (base) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ base = -EBUSY;
+ goto err;
+ }
+ } else {
+ /*
+ * Use __memblock_alloc_base() since
+ * memblock_alloc_base() panic()s.
+ */
+ phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
+ if (!addr) {
+ base = -ENOMEM;
+ goto err;
+ } else if (addr + size > ~(unsigned long)0) {
+ memblock_free(addr, size);
+ base = -EINVAL;
+ goto err;
+ } else {
+ base = addr;
+ }
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ r->start = base;
+ r->size = size;
+ r->dev = dev;
+ cma_reserved_count++;
+ pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
+ (unsigned long)base);
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(base, size);
+ return 0;
+err:
+ pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
+ return base;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific get_dev_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int align)
+{
+ unsigned long mask, pfn, pageno, start = 0;
+ struct cma *cma = dev_get_cma_area(dev);
+ int ret;
+
+ if (!cma || !cma->count)
+ return NULL;
+
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = (1 << align) - 1;
+
+ mutex_lock(&cma_mutex);
+
+ for (;;) {
+ pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
+ start, count, mask);
+ if (pageno >= cma->count) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ pfn = cma->base_pfn + pageno;
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ if (ret == 0) {
+ bitmap_set(cma->bitmap, pageno, count);
+ break;
+ } else if (ret != -EBUSY) {
+ goto error;
+ }
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = pageno + mask + 1;
+ }
+
+ mutex_unlock(&cma_mutex);
+
+ pr_debug("%s(): returned %p\n", __func__, pfn_to_page(pfn));
+ return pfn_to_page(pfn);
+error:
+ mutex_unlock(&cma_mutex);
+ return NULL;
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ struct cma *cma = dev_get_cma_area(dev);
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ mutex_lock(&cma_mutex);
+ bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count);
+ free_contig_range(pfn, count);
+ mutex_unlock(&cma_mutex);
+
+ return true;
+}
diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h
new file mode 100644
index 000000000000..c544356b374b
--- /dev/null
+++ b/include/asm-generic/dma-contiguous.h
@@ -0,0 +1,28 @@
+#ifndef ASM_DMA_CONTIGUOUS_H
+#define ASM_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CMA
+
+#include <linux/device.h>
+#include <linux/dma-contiguous.h>
+
+static inline struct cma *dev_get_cma_area(struct device *dev)
+{
+ if (dev && dev->cma_area)
+ return dev->cma_area;
+ return dma_contiguous_default_area;
+}
+
+static inline void dev_set_cma_area(struct device *dev, struct cma *cma)
+{
+ if (dev)
+ dev->cma_area = cma;
+ if (!dev || !dma_contiguous_default_area)
+ dma_contiguous_default_area = cma;
+}
+
+#endif
+#endif
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 5ad17cccdd71..e3399290436e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -661,6 +661,10 @@ struct device {
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
+#ifdef CONFIG_CMA
+ struct cma *cma_area; /* contiguous memory area for dma
+ allocations */
+#endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
new file mode 100644
index 000000000000..2f303e4b7ed3
--- /dev/null
+++ b/include/linux/dma-contiguous.h
@@ -0,0 +1,110 @@
+#ifndef __LINUX_CMA_H
+#define __LINUX_CMA_H
+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski(a)samsung.com>
+ * Michal Nazarewicz <mina86(a)mina86.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+/*
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ *
+ * Driver usage
+ *
+ * CMA should not be used by the device drivers directly. It is
+ * only a helper framework for dma-mapping subsystem.
+ *
+ * For more information, see kernel-docs in drivers/base/dma-contiguous.c
+ */
+
+#ifdef __KERNEL__
+
+struct cma;
+struct page;
+struct device;
+
+#ifdef CONFIG_CMA
+
+/*
+ * There is always at least global CMA area and a few optional device
+ * private areas configured in kernel .config.
+ */
+#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+
+extern struct cma *dma_contiguous_default_area;
+
+void dma_contiguous_reserve(phys_addr_t addr_limit);
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit);
+
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order);
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count);
+
+#else
+
+#define MAX_CMA_AREAS (0)
+
+static inline void dma_contiguous_reserve(phys_addr_t limit) { }
+
+static inline
+int dma_declare_contiguous(struct device *dev, unsigned long size,
+ phys_addr_t base, phys_addr_t limit)
+{
+ return -ENOSYS;
+}
+
+static inline
+struct page *dma_alloc_from_contiguous(struct device *dev, int count,
+ unsigned int order)
+{
+ return NULL;
+}
+
+static inline
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return false;
+}
+
+#endif
+
+#endif
+
+#endif
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4a93025cbe4a0b19d1a25a2d763a3d2018bad0d9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 6 Aug 2020 23:17:13 -0700
Subject: [PATCH] mm/shuffle: don't move pages between zones and don't read
garbage memmaps
Especially with memory hotplug, we can have offline sections (with a
garbage memmap) and overlapping zones. We have to make sure to only touch
initialized memmaps (online sections managed by the buddy) and that the
zone matches, to not move pages between zones.
To test if this can actually happen, I added a simple
BUG_ON(page_zone(page_i) != page_zone(page_j));
right before the swap. When hotplugging a 256M DIMM to a 4G x86-64 VM and
onlining the first memory block "online_movable" and the second memory
block "online_kernel", it will trigger the BUG, as both zones (NORMAL and
MOVABLE) overlap.
This might result in all kinds of weird situations (e.g., double
allocations, list corruptions, unmovable allocations ending up in the
movable zone).
Fixes: e900a918b098 ("mm: shuffle initial free memory to improve memory-side-cache utilization")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Wei Yang <richard.weiyang(a)linux.alibaba.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Dan Williams <dan.j.williams(a)intel.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Huang Ying <ying.huang(a)intel.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org> [5.2+]
Link: http://lkml.kernel.org/r/20200624094741.9918-2-david@redhat.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 44406d9977c7..dd13ab851b3e 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -58,25 +58,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
* For two pages to be swapped in the shuffle, they must be free (on a
* 'free_area' lru), have the same order, and have the same migratetype.
*/
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
/*
* Given we're dealing with randomly selected pfns in a zone we
* need to ask questions like...
*/
- /* ...is the pfn even in the memmap? */
- if (!pfn_valid_within(pfn))
+ /* ... is the page managed by the buddy? */
+ if (!page)
return NULL;
- /* ...is the pfn in a present section or a hole? */
- if (!pfn_in_present_section(pfn))
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
return NULL;
/* ...is the page free and currently on a free_area list? */
- page = pfn_to_page(pfn);
if (!PageBuddy(page))
return NULL;
@@ -123,7 +123,7 @@ void __meminit __shuffle_zone(struct zone *z)
* page_j randomly selected in the span @zone_start_pfn to
* @spanned_pages.
*/
- page_i = shuffle_valid_page(i, order);
+ page_i = shuffle_valid_page(z, i, order);
if (!page_i)
continue;
@@ -137,7 +137,7 @@ void __meminit __shuffle_zone(struct zone *z)
j = z->zone_start_pfn +
ALIGN_DOWN(get_random_long() % z->spanned_pages,
order_pages);
- page_j = shuffle_valid_page(j, order);
+ page_j = shuffle_valid_page(z, j, order);
if (page_j && page_j != page_i)
break;
}
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bbe98f9cadff58cdd6a4acaeba0efa8565dabe65 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 6 Aug 2020 23:26:25 -0700
Subject: [PATCH] khugepaged: khugepaged_test_exit() check mmget_still_valid()
Move collapse_huge_page()'s mmget_still_valid() check into
khugepaged_test_exit() itself. collapse_huge_page() is used for anon THP
only, and earned its mmget_still_valid() check because it inserts a huge
pmd entry in place of the page table's pmd entry; whereas
collapse_file()'s retract_page_tables() or collapse_pte_mapped_thp()
merely clears the page table's pmd entry. But core dumping without mmap
lock must have been as open to mistaking a racily cleared pmd entry for a
page table at physical page 0, as exit_mmap() was. And we certainly have
no interest in mapping as a THP once dumping core.
Fixes: 59ea6d06cfa9 ("coredump: fix race condition between collapse_huge_page() and core dumping")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> [4.8+]
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008021217020.27773@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac04b332a373..b52bd46ad146 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 65f0f017e7be8c70330372df23bcb2a407ecf02d Mon Sep 17 00:00:00 2001
From: Coly Li <colyli(a)suse.de>
Date: Sat, 25 Jul 2020 20:00:21 +0800
Subject: [PATCH] bcache: avoid nr_stripes overflow in bcache_device_init()
For some block devices which large capacity (e.g. 8TB) but small io_opt
size (e.g. 8 sectors), in bcache_device_init() the stripes number calcu-
lated by,
DIV_ROUND_UP_ULL(sectors, d->stripe_size);
might be overflow to the unsigned int bcache_device->nr_stripes.
This patch uses the uint64_t variable to store DIV_ROUND_UP_ULL()
and after the value is checked to be available in unsigned int range,
sets it to bache_device->nr_stripes. Then the overflow is avoided.
Reported-and-tested-by: Ken Raeburn <raeburn(a)redhat.com>
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1783075
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4a77bfd4009f..0f90616dc8d3 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -835,19 +835,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
- size_t n;
+ uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes || d->nr_stripes > max_stripes) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n",
- (unsigned int)d->nr_stripes);
+ n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+ if (!n || n > max_stripes) {
+ pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
+ n);
return -ENOMEM;
}
+ d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 65f0f017e7be8c70330372df23bcb2a407ecf02d Mon Sep 17 00:00:00 2001
From: Coly Li <colyli(a)suse.de>
Date: Sat, 25 Jul 2020 20:00:21 +0800
Subject: [PATCH] bcache: avoid nr_stripes overflow in bcache_device_init()
For some block devices which large capacity (e.g. 8TB) but small io_opt
size (e.g. 8 sectors), in bcache_device_init() the stripes number calcu-
lated by,
DIV_ROUND_UP_ULL(sectors, d->stripe_size);
might be overflow to the unsigned int bcache_device->nr_stripes.
This patch uses the uint64_t variable to store DIV_ROUND_UP_ULL()
and after the value is checked to be available in unsigned int range,
sets it to bache_device->nr_stripes. Then the overflow is avoided.
Reported-and-tested-by: Ken Raeburn <raeburn(a)redhat.com>
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1783075
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4a77bfd4009f..0f90616dc8d3 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -835,19 +835,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
- size_t n;
+ uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes || d->nr_stripes > max_stripes) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n",
- (unsigned int)d->nr_stripes);
+ n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+ if (!n || n > max_stripes) {
+ pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
+ n);
return -ENOMEM;
}
+ d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
This is a note to let you know that I've just added the patch titled
usb: dwc3: gadget: Handle ZLP for sg requests
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From bc9a2e226ea95e1699f7590845554de095308b75 Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Date: Thu, 6 Aug 2020 19:46:35 -0700
Subject: usb: dwc3: gadget: Handle ZLP for sg requests
Currently dwc3 doesn't handle usb_request->zero for SG requests. This
change checks and prepares extra TRBs for the ZLP for SG requests.
Cc: <stable(a)vger.kernel.org> # v4.5+
Fixes: 04c03d10e507 ("usb: dwc3: gadget: handle request->zero")
Signed-off-by: Thinh Nguyen <thinhn(a)synopsys.com>
Signed-off-by: Felipe Balbi <balbi(a)kernel.org>
---
drivers/usb/dwc3/gadget.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index df603a817a98..c2a0f64f8d1e 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1143,6 +1143,37 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
req->request.short_not_ok,
req->request.no_interrupt,
req->request.is_last);
+ } else if (req->request.zero && req->request.length &&
+ !usb_endpoint_xfer_isoc(dep->endpoint.desc) &&
+ !rem && !chain) {
+ struct dwc3 *dwc = dep->dwc;
+ struct dwc3_trb *trb;
+
+ req->needs_extra_trb = true;
+
+ /* Prepare normal TRB */
+ dwc3_prepare_one_trb(dep, req, trb_length, true, i);
+
+ /* Prepare one extra TRB to handle ZLP */
+ trb = &dep->trb_pool[dep->trb_enqueue];
+ req->num_trbs++;
+ __dwc3_prepare_one_trb(dep, trb, dwc->bounce_addr, 0,
+ !req->direction, 1,
+ req->request.stream_id,
+ req->request.short_not_ok,
+ req->request.no_interrupt,
+ req->request.is_last);
+
+ /* Prepare one more TRB to handle MPS alignment */
+ if (!req->direction) {
+ trb = &dep->trb_pool[dep->trb_enqueue];
+ req->num_trbs++;
+ __dwc3_prepare_one_trb(dep, trb, dwc->bounce_addr, maxp,
+ false, 1, req->request.stream_id,
+ req->request.short_not_ok,
+ req->request.no_interrupt,
+ req->request.is_last);
+ }
} else {
dwc3_prepare_one_trb(dep, req, trb_length, chain, i);
}
--
2.28.0
This is a note to let you know that I've just added the patch titled
usb: dwc3: gadget: Fix handling ZLP
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From d2ee3ff79e6a3d4105e684021017d100524dc560 Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Date: Thu, 6 Aug 2020 19:46:29 -0700
Subject: usb: dwc3: gadget: Fix handling ZLP
The usb_request->zero doesn't apply for isoc. Also, if we prepare a
0-length (ZLP) TRB for the OUT direction, we need to prepare an extra
TRB to pad up to the MPS alignment. Use the same bounce buffer for the
ZLP TRB and the extra pad TRB.
Cc: <stable(a)vger.kernel.org> # v4.5+
Fixes: d6e5a549cc4d ("usb: dwc3: simplify ZLP handling")
Fixes: 04c03d10e507 ("usb: dwc3: gadget: handle request->zero")
Signed-off-by: Thinh Nguyen <thinhn(a)synopsys.com>
Signed-off-by: Felipe Balbi <balbi(a)kernel.org>
---
drivers/usb/dwc3/gadget.c | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index f9231253cbed..df603a817a98 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1199,6 +1199,7 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep,
req->request.no_interrupt,
req->request.is_last);
} else if (req->request.zero && req->request.length &&
+ !usb_endpoint_xfer_isoc(dep->endpoint.desc) &&
(IS_ALIGNED(req->request.length, maxp))) {
struct dwc3 *dwc = dep->dwc;
struct dwc3_trb *trb;
@@ -1208,14 +1209,25 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep,
/* prepare normal TRB */
dwc3_prepare_one_trb(dep, req, length, true, 0);
- /* Now prepare one extra TRB to handle ZLP */
+ /* Prepare one extra TRB to handle ZLP */
trb = &dep->trb_pool[dep->trb_enqueue];
req->num_trbs++;
__dwc3_prepare_one_trb(dep, trb, dwc->bounce_addr, 0,
- false, 1, req->request.stream_id,
+ !req->direction, 1, req->request.stream_id,
req->request.short_not_ok,
req->request.no_interrupt,
req->request.is_last);
+
+ /* Prepare one more TRB to handle MPS alignment for OUT */
+ if (!req->direction) {
+ trb = &dep->trb_pool[dep->trb_enqueue];
+ req->num_trbs++;
+ __dwc3_prepare_one_trb(dep, trb, dwc->bounce_addr, maxp,
+ false, 1, req->request.stream_id,
+ req->request.short_not_ok,
+ req->request.no_interrupt,
+ req->request.is_last);
+ }
} else {
dwc3_prepare_one_trb(dep, req, length, false, 0);
}
@@ -2690,8 +2702,17 @@ static int dwc3_gadget_ep_cleanup_completed_request(struct dwc3_ep *dep,
status);
if (req->needs_extra_trb) {
+ unsigned int maxp = usb_endpoint_maxp(dep->endpoint.desc);
+
ret = dwc3_gadget_ep_reclaim_trb_linear(dep, req, event,
status);
+
+ /* Reclaim MPS padding TRB for ZLP */
+ if (!req->direction && req->request.zero && req->request.length &&
+ !usb_endpoint_xfer_isoc(dep->endpoint.desc) &&
+ (IS_ALIGNED(req->request.length, maxp)))
+ ret = dwc3_gadget_ep_reclaim_trb_linear(dep, req, event, status);
+
req->needs_extra_trb = false;
}
--
2.28.0
This is a note to let you know that I've just added the patch titled
usb: dwc3: gadget: Don't setup more than requested
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 5d187c0454ef4c5e046a81af36882d4d515922ec Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Date: Thu, 6 Aug 2020 19:46:23 -0700
Subject: usb: dwc3: gadget: Don't setup more than requested
The SG list may be set up with entry size more than the requested
length. Check the usb_request->length and make sure that we don't setup
the TRBs to send/receive more than requested. This case may occur when
the SG entry is allocated up to a certain minimum size, but the request
length is less than that. It can also occur when the request is reused
for a different request length.
Cc: <stable(a)vger.kernel.org> # v4.18+
Fixes: a31e63b608ff ("usb: dwc3: gadget: Correct handling of scattergather lists")
Signed-off-by: Thinh Nguyen <thinhn(a)synopsys.com>
Signed-off-by: Felipe Balbi <balbi(a)kernel.org>
---
drivers/usb/dwc3/gadget.c | 51 +++++++++++++++++++++++++++------------
1 file changed, 35 insertions(+), 16 deletions(-)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index e44bfc3b5096..f9231253cbed 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1054,27 +1054,25 @@ static void __dwc3_prepare_one_trb(struct dwc3_ep *dep, struct dwc3_trb *trb,
* dwc3_prepare_one_trb - setup one TRB from one request
* @dep: endpoint for which this request is prepared
* @req: dwc3_request pointer
+ * @trb_length: buffer size of the TRB
* @chain: should this TRB be chained to the next?
* @node: only for isochronous endpoints. First TRB needs different type.
*/
static void dwc3_prepare_one_trb(struct dwc3_ep *dep,
- struct dwc3_request *req, unsigned chain, unsigned node)
+ struct dwc3_request *req, unsigned int trb_length,
+ unsigned chain, unsigned node)
{
struct dwc3_trb *trb;
- unsigned int length;
dma_addr_t dma;
unsigned stream_id = req->request.stream_id;
unsigned short_not_ok = req->request.short_not_ok;
unsigned no_interrupt = req->request.no_interrupt;
unsigned is_last = req->request.is_last;
- if (req->request.num_sgs > 0) {
- length = sg_dma_len(req->start_sg);
+ if (req->request.num_sgs > 0)
dma = sg_dma_address(req->start_sg);
- } else {
- length = req->request.length;
+ else
dma = req->request.dma;
- }
trb = &dep->trb_pool[dep->trb_enqueue];
@@ -1086,7 +1084,7 @@ static void dwc3_prepare_one_trb(struct dwc3_ep *dep,
req->num_trbs++;
- __dwc3_prepare_one_trb(dep, trb, dma, length, chain, node,
+ __dwc3_prepare_one_trb(dep, trb, dma, trb_length, chain, node,
stream_id, short_not_ok, no_interrupt, is_last);
}
@@ -1096,16 +1094,27 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
struct scatterlist *sg = req->start_sg;
struct scatterlist *s;
int i;
-
+ unsigned int length = req->request.length;
unsigned int remaining = req->request.num_mapped_sgs
- req->num_queued_sgs;
+ /*
+ * If we resume preparing the request, then get the remaining length of
+ * the request and resume where we left off.
+ */
+ for_each_sg(req->request.sg, s, req->num_queued_sgs, i)
+ length -= sg_dma_len(s);
+
for_each_sg(sg, s, remaining, i) {
- unsigned int length = req->request.length;
unsigned int maxp = usb_endpoint_maxp(dep->endpoint.desc);
unsigned int rem = length % maxp;
+ unsigned int trb_length;
unsigned chain = true;
+ trb_length = min_t(unsigned int, length, sg_dma_len(s));
+
+ length -= trb_length;
+
/*
* IOMMU driver is coalescing the list of sgs which shares a
* page boundary into one and giving it to USB driver. With
@@ -1113,7 +1122,7 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
* sgs passed. So mark the chain bit to false if it isthe last
* mapped sg.
*/
- if (i == remaining - 1)
+ if ((i == remaining - 1) || !length)
chain = false;
if (rem && usb_endpoint_dir_out(dep->endpoint.desc) && !chain) {
@@ -1123,7 +1132,7 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
req->needs_extra_trb = true;
/* prepare normal TRB */
- dwc3_prepare_one_trb(dep, req, true, i);
+ dwc3_prepare_one_trb(dep, req, trb_length, true, i);
/* Now prepare one extra TRB to align transfer size */
trb = &dep->trb_pool[dep->trb_enqueue];
@@ -1135,7 +1144,7 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
req->request.no_interrupt,
req->request.is_last);
} else {
- dwc3_prepare_one_trb(dep, req, chain, i);
+ dwc3_prepare_one_trb(dep, req, trb_length, chain, i);
}
/*
@@ -1150,6 +1159,16 @@ static void dwc3_prepare_one_trb_sg(struct dwc3_ep *dep,
req->num_queued_sgs++;
+ /*
+ * The number of pending SG entries may not correspond to the
+ * number of mapped SG entries. If all the data are queued, then
+ * don't include unused SG entries.
+ */
+ if (length == 0) {
+ req->num_pending_sgs -= req->request.num_mapped_sgs - req->num_queued_sgs;
+ break;
+ }
+
if (!dwc3_calc_trbs_left(dep))
break;
}
@@ -1169,7 +1188,7 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep,
req->needs_extra_trb = true;
/* prepare normal TRB */
- dwc3_prepare_one_trb(dep, req, true, 0);
+ dwc3_prepare_one_trb(dep, req, length, true, 0);
/* Now prepare one extra TRB to align transfer size */
trb = &dep->trb_pool[dep->trb_enqueue];
@@ -1187,7 +1206,7 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep,
req->needs_extra_trb = true;
/* prepare normal TRB */
- dwc3_prepare_one_trb(dep, req, true, 0);
+ dwc3_prepare_one_trb(dep, req, length, true, 0);
/* Now prepare one extra TRB to handle ZLP */
trb = &dep->trb_pool[dep->trb_enqueue];
@@ -1198,7 +1217,7 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep,
req->request.no_interrupt,
req->request.is_last);
} else {
- dwc3_prepare_one_trb(dep, req, false, 0);
+ dwc3_prepare_one_trb(dep, req, length, false, 0);
}
}
--
2.28.0
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 156abe2961601d60a8c2a60c6dc8dd6ce7adcdaf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Sat, 6 Jun 2020 11:31:50 +0200
Subject: [PATCH] pinctrl: baytrail: Fix pin being driven low for a while on
gpiod_get(..., GPIOD_OUT_HIGH)
The pins on the Bay Trail SoC have separate input-buffer and output-buffer
enable bits and a read of the level bit of the value register will always
return the value from the input-buffer.
The BIOS of a device may configure a pin in output-only mode, only enabling
the output buffer, and write 1 to the level bit to drive the pin high.
This 1 written to the level bit will be stored inside the data-latch of the
output buffer.
But a subsequent read of the value register will return 0 for the level bit
because the input-buffer is disabled. This causes a read-modify-write as
done by byt_gpio_set_direction() to write 0 to the level bit, driving the
pin low!
Before this commit byt_gpio_direction_output() relied on
pinctrl_gpio_direction_output() to set the direction, followed by a call
to byt_gpio_set() to apply the selected value. This causes the pin to
go low between the pinctrl_gpio_direction_output() and byt_gpio_set()
calls.
Change byt_gpio_direction_output() to directly make the register
modifications itself instead. Replacing the 2 subsequent writes to the
value register with a single write.
Note that the pinctrl code does not keep track internally of the direction,
so not going through pinctrl_gpio_direction_output() is not an issue.
This issue was noticed on a Trekstor SurfTab Twin 10.1. When the panel is
already on at boot (no external monitor connected), then the i915 driver
does a gpiod_get(..., GPIOD_OUT_HIGH) for the panel-enable GPIO. The
temporarily going low of that GPIO was causing the panel to reset itself
after which it would not show an image until it was turned off and back on
again (until a full modeset was done on it). This commit fixes this.
This commit also updates the byt_gpio_direction_input() to use direct
register accesses instead of going through pinctrl_gpio_direction_input(),
to keep it consistent with byt_gpio_direction_output().
Note for backporting, this commit depends on:
commit e2b74419e5cc ("pinctrl: baytrail: Replace WARN with dev_info_once
when setting direct-irq pin to output")
Cc: stable(a)vger.kernel.org
Fixes: 86e3ef812fe3 ("pinctrl: baytrail: Update gpio chip operations")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Acked-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index e3ceb3dfeabe..a917a2df520e 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
@@ -800,6 +800,21 @@ static void byt_gpio_disable_free(struct pinctrl_dev *pctl_dev,
pm_runtime_put(vg->dev);
}
+static void byt_gpio_direct_irq_check(struct intel_pinctrl *vg,
+ unsigned int offset)
+{
+ void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
+
+ /*
+ * Before making any direction modifications, do a check if gpio is set
+ * for direct IRQ. On Bay Trail, setting GPIO to output does not make
+ * sense, so let's at least inform the caller before they shoot
+ * themselves in the foot.
+ */
+ if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
+ dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+}
+
static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
struct pinctrl_gpio_range *range,
unsigned int offset,
@@ -807,7 +822,6 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
{
struct intel_pinctrl *vg = pinctrl_dev_get_drvdata(pctl_dev);
void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
- void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
unsigned long flags;
u32 value;
@@ -817,14 +831,8 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
value &= ~BYT_DIR_MASK;
if (input)
value |= BYT_OUTPUT_EN;
- else if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
- /*
- * Before making any direction modifications, do a check if gpio
- * is set for direct IRQ. On baytrail, setting GPIO to output
- * does not make sense, so let's at least inform the caller before
- * they shoot themselves in the foot.
- */
- dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+ else
+ byt_gpio_direct_irq_check(vg, offset);
writel(value, val_reg);
@@ -1165,19 +1173,50 @@ static int byt_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
static int byt_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
{
- return pinctrl_gpio_direction_input(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
+
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ reg |= BYT_OUTPUT_EN;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
+ return 0;
}
+/*
+ * Note despite the temptation this MUST NOT be converted into a call to
+ * pinctrl_gpio_direction_output() + byt_gpio_set() that does not work this
+ * MUST be done as a single BYT_VAL_REG register write.
+ * See the commit message of the commit adding this comment for details.
+ */
static int byt_gpio_direction_output(struct gpio_chip *chip,
unsigned int offset, int value)
{
- int ret = pinctrl_gpio_direction_output(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
- if (ret)
- return ret;
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ byt_gpio_direct_irq_check(vg, offset);
- byt_gpio_set(chip, offset, value);
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ if (value)
+ reg |= BYT_LEVEL;
+ else
+ reg &= ~BYT_LEVEL;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
return 0;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 156abe2961601d60a8c2a60c6dc8dd6ce7adcdaf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Sat, 6 Jun 2020 11:31:50 +0200
Subject: [PATCH] pinctrl: baytrail: Fix pin being driven low for a while on
gpiod_get(..., GPIOD_OUT_HIGH)
The pins on the Bay Trail SoC have separate input-buffer and output-buffer
enable bits and a read of the level bit of the value register will always
return the value from the input-buffer.
The BIOS of a device may configure a pin in output-only mode, only enabling
the output buffer, and write 1 to the level bit to drive the pin high.
This 1 written to the level bit will be stored inside the data-latch of the
output buffer.
But a subsequent read of the value register will return 0 for the level bit
because the input-buffer is disabled. This causes a read-modify-write as
done by byt_gpio_set_direction() to write 0 to the level bit, driving the
pin low!
Before this commit byt_gpio_direction_output() relied on
pinctrl_gpio_direction_output() to set the direction, followed by a call
to byt_gpio_set() to apply the selected value. This causes the pin to
go low between the pinctrl_gpio_direction_output() and byt_gpio_set()
calls.
Change byt_gpio_direction_output() to directly make the register
modifications itself instead. Replacing the 2 subsequent writes to the
value register with a single write.
Note that the pinctrl code does not keep track internally of the direction,
so not going through pinctrl_gpio_direction_output() is not an issue.
This issue was noticed on a Trekstor SurfTab Twin 10.1. When the panel is
already on at boot (no external monitor connected), then the i915 driver
does a gpiod_get(..., GPIOD_OUT_HIGH) for the panel-enable GPIO. The
temporarily going low of that GPIO was causing the panel to reset itself
after which it would not show an image until it was turned off and back on
again (until a full modeset was done on it). This commit fixes this.
This commit also updates the byt_gpio_direction_input() to use direct
register accesses instead of going through pinctrl_gpio_direction_input(),
to keep it consistent with byt_gpio_direction_output().
Note for backporting, this commit depends on:
commit e2b74419e5cc ("pinctrl: baytrail: Replace WARN with dev_info_once
when setting direct-irq pin to output")
Cc: stable(a)vger.kernel.org
Fixes: 86e3ef812fe3 ("pinctrl: baytrail: Update gpio chip operations")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Acked-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index e3ceb3dfeabe..a917a2df520e 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
@@ -800,6 +800,21 @@ static void byt_gpio_disable_free(struct pinctrl_dev *pctl_dev,
pm_runtime_put(vg->dev);
}
+static void byt_gpio_direct_irq_check(struct intel_pinctrl *vg,
+ unsigned int offset)
+{
+ void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
+
+ /*
+ * Before making any direction modifications, do a check if gpio is set
+ * for direct IRQ. On Bay Trail, setting GPIO to output does not make
+ * sense, so let's at least inform the caller before they shoot
+ * themselves in the foot.
+ */
+ if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
+ dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+}
+
static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
struct pinctrl_gpio_range *range,
unsigned int offset,
@@ -807,7 +822,6 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
{
struct intel_pinctrl *vg = pinctrl_dev_get_drvdata(pctl_dev);
void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
- void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
unsigned long flags;
u32 value;
@@ -817,14 +831,8 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
value &= ~BYT_DIR_MASK;
if (input)
value |= BYT_OUTPUT_EN;
- else if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
- /*
- * Before making any direction modifications, do a check if gpio
- * is set for direct IRQ. On baytrail, setting GPIO to output
- * does not make sense, so let's at least inform the caller before
- * they shoot themselves in the foot.
- */
- dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+ else
+ byt_gpio_direct_irq_check(vg, offset);
writel(value, val_reg);
@@ -1165,19 +1173,50 @@ static int byt_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
static int byt_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
{
- return pinctrl_gpio_direction_input(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
+
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ reg |= BYT_OUTPUT_EN;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
+ return 0;
}
+/*
+ * Note despite the temptation this MUST NOT be converted into a call to
+ * pinctrl_gpio_direction_output() + byt_gpio_set() that does not work this
+ * MUST be done as a single BYT_VAL_REG register write.
+ * See the commit message of the commit adding this comment for details.
+ */
static int byt_gpio_direction_output(struct gpio_chip *chip,
unsigned int offset, int value)
{
- int ret = pinctrl_gpio_direction_output(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
- if (ret)
- return ret;
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ byt_gpio_direct_irq_check(vg, offset);
- byt_gpio_set(chip, offset, value);
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ if (value)
+ reg |= BYT_LEVEL;
+ else
+ reg &= ~BYT_LEVEL;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
return 0;
}
The patch below does not apply to the 5.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 156abe2961601d60a8c2a60c6dc8dd6ce7adcdaf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Sat, 6 Jun 2020 11:31:50 +0200
Subject: [PATCH] pinctrl: baytrail: Fix pin being driven low for a while on
gpiod_get(..., GPIOD_OUT_HIGH)
The pins on the Bay Trail SoC have separate input-buffer and output-buffer
enable bits and a read of the level bit of the value register will always
return the value from the input-buffer.
The BIOS of a device may configure a pin in output-only mode, only enabling
the output buffer, and write 1 to the level bit to drive the pin high.
This 1 written to the level bit will be stored inside the data-latch of the
output buffer.
But a subsequent read of the value register will return 0 for the level bit
because the input-buffer is disabled. This causes a read-modify-write as
done by byt_gpio_set_direction() to write 0 to the level bit, driving the
pin low!
Before this commit byt_gpio_direction_output() relied on
pinctrl_gpio_direction_output() to set the direction, followed by a call
to byt_gpio_set() to apply the selected value. This causes the pin to
go low between the pinctrl_gpio_direction_output() and byt_gpio_set()
calls.
Change byt_gpio_direction_output() to directly make the register
modifications itself instead. Replacing the 2 subsequent writes to the
value register with a single write.
Note that the pinctrl code does not keep track internally of the direction,
so not going through pinctrl_gpio_direction_output() is not an issue.
This issue was noticed on a Trekstor SurfTab Twin 10.1. When the panel is
already on at boot (no external monitor connected), then the i915 driver
does a gpiod_get(..., GPIOD_OUT_HIGH) for the panel-enable GPIO. The
temporarily going low of that GPIO was causing the panel to reset itself
after which it would not show an image until it was turned off and back on
again (until a full modeset was done on it). This commit fixes this.
This commit also updates the byt_gpio_direction_input() to use direct
register accesses instead of going through pinctrl_gpio_direction_input(),
to keep it consistent with byt_gpio_direction_output().
Note for backporting, this commit depends on:
commit e2b74419e5cc ("pinctrl: baytrail: Replace WARN with dev_info_once
when setting direct-irq pin to output")
Cc: stable(a)vger.kernel.org
Fixes: 86e3ef812fe3 ("pinctrl: baytrail: Update gpio chip operations")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Acked-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index e3ceb3dfeabe..a917a2df520e 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
@@ -800,6 +800,21 @@ static void byt_gpio_disable_free(struct pinctrl_dev *pctl_dev,
pm_runtime_put(vg->dev);
}
+static void byt_gpio_direct_irq_check(struct intel_pinctrl *vg,
+ unsigned int offset)
+{
+ void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
+
+ /*
+ * Before making any direction modifications, do a check if gpio is set
+ * for direct IRQ. On Bay Trail, setting GPIO to output does not make
+ * sense, so let's at least inform the caller before they shoot
+ * themselves in the foot.
+ */
+ if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
+ dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+}
+
static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
struct pinctrl_gpio_range *range,
unsigned int offset,
@@ -807,7 +822,6 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
{
struct intel_pinctrl *vg = pinctrl_dev_get_drvdata(pctl_dev);
void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
- void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
unsigned long flags;
u32 value;
@@ -817,14 +831,8 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
value &= ~BYT_DIR_MASK;
if (input)
value |= BYT_OUTPUT_EN;
- else if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
- /*
- * Before making any direction modifications, do a check if gpio
- * is set for direct IRQ. On baytrail, setting GPIO to output
- * does not make sense, so let's at least inform the caller before
- * they shoot themselves in the foot.
- */
- dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+ else
+ byt_gpio_direct_irq_check(vg, offset);
writel(value, val_reg);
@@ -1165,19 +1173,50 @@ static int byt_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
static int byt_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
{
- return pinctrl_gpio_direction_input(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
+
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ reg |= BYT_OUTPUT_EN;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
+ return 0;
}
+/*
+ * Note despite the temptation this MUST NOT be converted into a call to
+ * pinctrl_gpio_direction_output() + byt_gpio_set() that does not work this
+ * MUST be done as a single BYT_VAL_REG register write.
+ * See the commit message of the commit adding this comment for details.
+ */
static int byt_gpio_direction_output(struct gpio_chip *chip,
unsigned int offset, int value)
{
- int ret = pinctrl_gpio_direction_output(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
- if (ret)
- return ret;
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ byt_gpio_direct_irq_check(vg, offset);
- byt_gpio_set(chip, offset, value);
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ if (value)
+ reg |= BYT_LEVEL;
+ else
+ reg &= ~BYT_LEVEL;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
return 0;
}
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 156abe2961601d60a8c2a60c6dc8dd6ce7adcdaf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Sat, 6 Jun 2020 11:31:50 +0200
Subject: [PATCH] pinctrl: baytrail: Fix pin being driven low for a while on
gpiod_get(..., GPIOD_OUT_HIGH)
The pins on the Bay Trail SoC have separate input-buffer and output-buffer
enable bits and a read of the level bit of the value register will always
return the value from the input-buffer.
The BIOS of a device may configure a pin in output-only mode, only enabling
the output buffer, and write 1 to the level bit to drive the pin high.
This 1 written to the level bit will be stored inside the data-latch of the
output buffer.
But a subsequent read of the value register will return 0 for the level bit
because the input-buffer is disabled. This causes a read-modify-write as
done by byt_gpio_set_direction() to write 0 to the level bit, driving the
pin low!
Before this commit byt_gpio_direction_output() relied on
pinctrl_gpio_direction_output() to set the direction, followed by a call
to byt_gpio_set() to apply the selected value. This causes the pin to
go low between the pinctrl_gpio_direction_output() and byt_gpio_set()
calls.
Change byt_gpio_direction_output() to directly make the register
modifications itself instead. Replacing the 2 subsequent writes to the
value register with a single write.
Note that the pinctrl code does not keep track internally of the direction,
so not going through pinctrl_gpio_direction_output() is not an issue.
This issue was noticed on a Trekstor SurfTab Twin 10.1. When the panel is
already on at boot (no external monitor connected), then the i915 driver
does a gpiod_get(..., GPIOD_OUT_HIGH) for the panel-enable GPIO. The
temporarily going low of that GPIO was causing the panel to reset itself
after which it would not show an image until it was turned off and back on
again (until a full modeset was done on it). This commit fixes this.
This commit also updates the byt_gpio_direction_input() to use direct
register accesses instead of going through pinctrl_gpio_direction_input(),
to keep it consistent with byt_gpio_direction_output().
Note for backporting, this commit depends on:
commit e2b74419e5cc ("pinctrl: baytrail: Replace WARN with dev_info_once
when setting direct-irq pin to output")
Cc: stable(a)vger.kernel.org
Fixes: 86e3ef812fe3 ("pinctrl: baytrail: Update gpio chip operations")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Acked-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index e3ceb3dfeabe..a917a2df520e 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
@@ -800,6 +800,21 @@ static void byt_gpio_disable_free(struct pinctrl_dev *pctl_dev,
pm_runtime_put(vg->dev);
}
+static void byt_gpio_direct_irq_check(struct intel_pinctrl *vg,
+ unsigned int offset)
+{
+ void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
+
+ /*
+ * Before making any direction modifications, do a check if gpio is set
+ * for direct IRQ. On Bay Trail, setting GPIO to output does not make
+ * sense, so let's at least inform the caller before they shoot
+ * themselves in the foot.
+ */
+ if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
+ dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+}
+
static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
struct pinctrl_gpio_range *range,
unsigned int offset,
@@ -807,7 +822,6 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
{
struct intel_pinctrl *vg = pinctrl_dev_get_drvdata(pctl_dev);
void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
- void __iomem *conf_reg = byt_gpio_reg(vg, offset, BYT_CONF0_REG);
unsigned long flags;
u32 value;
@@ -817,14 +831,8 @@ static int byt_gpio_set_direction(struct pinctrl_dev *pctl_dev,
value &= ~BYT_DIR_MASK;
if (input)
value |= BYT_OUTPUT_EN;
- else if (readl(conf_reg) & BYT_DIRECT_IRQ_EN)
- /*
- * Before making any direction modifications, do a check if gpio
- * is set for direct IRQ. On baytrail, setting GPIO to output
- * does not make sense, so let's at least inform the caller before
- * they shoot themselves in the foot.
- */
- dev_info_once(vg->dev, "Potential Error: Setting GPIO with direct_irq_en to output");
+ else
+ byt_gpio_direct_irq_check(vg, offset);
writel(value, val_reg);
@@ -1165,19 +1173,50 @@ static int byt_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
static int byt_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
{
- return pinctrl_gpio_direction_input(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
+
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ reg |= BYT_OUTPUT_EN;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
+ return 0;
}
+/*
+ * Note despite the temptation this MUST NOT be converted into a call to
+ * pinctrl_gpio_direction_output() + byt_gpio_set() that does not work this
+ * MUST be done as a single BYT_VAL_REG register write.
+ * See the commit message of the commit adding this comment for details.
+ */
static int byt_gpio_direction_output(struct gpio_chip *chip,
unsigned int offset, int value)
{
- int ret = pinctrl_gpio_direction_output(chip->base + offset);
+ struct intel_pinctrl *vg = gpiochip_get_data(chip);
+ void __iomem *val_reg = byt_gpio_reg(vg, offset, BYT_VAL_REG);
+ unsigned long flags;
+ u32 reg;
- if (ret)
- return ret;
+ raw_spin_lock_irqsave(&byt_lock, flags);
+
+ byt_gpio_direct_irq_check(vg, offset);
- byt_gpio_set(chip, offset, value);
+ reg = readl(val_reg);
+ reg &= ~BYT_DIR_MASK;
+ if (value)
+ reg |= BYT_LEVEL;
+ else
+ reg &= ~BYT_LEVEL;
+ writel(reg, val_reg);
+
+ raw_spin_unlock_irqrestore(&byt_lock, flags);
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 63dee5df43a31f3844efabc58972f0a206ca4534 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe(a)ellerman.id.au>
Date: Fri, 24 Jul 2020 19:25:25 +1000
Subject: [PATCH] powerpc: Allow 4224 bytes of stack expansion for the signal
frame
We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.
The code was originally added in 2004 "ported from 2.4". The rough
logic is that the stack is allowed to grow to 1MB with no extra
checking. Over 1MB the access must be within 2048 bytes of the stack
pointer, or be from a user instruction that updates the stack pointer.
The 2048 byte allowance below the stack pointer is there to cover the
288 byte "red zone" as well as the "about 1.5kB" needed by the signal
delivery code.
Unfortunately since then the signal frame has expanded, and is now
4224 bytes on 64-bit kernels with transactional memory enabled. This
means if a process has consumed more than 1MB of stack, and its stack
pointer lies less than 4224 bytes from the next page boundary, signal
delivery will fault when trying to expand the stack and the process
will see a SEGV.
The total size of the signal frame is the size of struct rt_sigframe
(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
64-bit).
The 2048 byte allowance was correct until 2008 as the signal frame
was:
struct rt_sigframe {
struct ucontext uc; /* 0 1440 */
/* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1440 16 */
unsigned int tramp[6]; /* 1456 24 */
struct siginfo * pinfo; /* 1480 8 */
void * puc; /* 1488 8 */
struct siginfo info; /* 1496 128 */
/* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1624 288 */
/* size: 1920, cachelines: 15, members: 7 */
/* padding: 8 */
};
1920 + 128 = 2048
Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
ptrace and signal support") (Jul 2008) the signal frame expanded to
2304 bytes:
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */ <--
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1696 16 */
unsigned int tramp[6]; /* 1712 24 */
struct siginfo * pinfo; /* 1736 8 */
void * puc; /* 1744 8 */
struct siginfo info; /* 1752 128 */
/* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1880 288 */
/* size: 2176, cachelines: 17, members: 7 */
/* padding: 8 */
};
2176 + 128 = 2304
At this point we should have been exposed to the bug, though as far as
I know it was never reported. I no longer have a system old enough to
easily test on.
Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
grow-down stack segment") caused our stack expansion code to never
trigger, as there was always a VMA found for a write up to PAGE_SIZE
below r1.
That meant the bug was hidden as we continued to expand the signal
frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
state to the signal context") (Feb 2013):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */ <--
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[288]; /* 3576 288 */
/* size: 3872, cachelines: 31, members: 8 */
/* padding: 8 */
/* last cacheline: 32 bytes */
};
3872 + 128 = 4000
And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
userspace to 512 bytes") (Feb 2014):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[512]; /* 3576 512 */ <--
/* size: 4096, cachelines: 32, members: 8 */
/* padding: 8 */
};
4096 + 128 = 4224
Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
gap, between vmas") exposed us to the existing bug, because it changed
the stack VMA to be the correct/real size, meaning our stack expansion
code is now triggered.
Fix it by increasing the allowance to 4224 bytes.
Hard-coding 4224 is obviously unsafe against future expansions of the
signal frame in the same way as the existing code. We can't easily use
sizeof() because the signal frame structure is not in a header. We
will either fix that, or rip out all the custom stack expansion
checking logic entirely.
Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
Cc: stable(a)vger.kernel.org # v2.6.27+
Reported-by: Tom Lane <tgl(a)sss.pgh.pa.us>
Tested-by: Daniel Axtens <dja(a)axtens.net>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20200724092528.1578671-2-mpe@ellerman.id.au
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 641fc5f3d7dd..3ebb1792e636 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
return false;
}
+// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
+#define SIGFRAME_MAX_SIZE (4096 + 128)
+
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
struct vm_area_struct *vma, unsigned int flags,
bool *must_retry)
@@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
+ * The kernel signal delivery code writes a bit over 4KB
* below the stack pointer (r1) before decrementing it.
* The exec code can write slightly over 640kB to the stack
* before setting the user r1. Thus we allow the stack to
@@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 >= uregs->gpr[1])
+ if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
return false;
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 63dee5df43a31f3844efabc58972f0a206ca4534 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe(a)ellerman.id.au>
Date: Fri, 24 Jul 2020 19:25:25 +1000
Subject: [PATCH] powerpc: Allow 4224 bytes of stack expansion for the signal
frame
We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.
The code was originally added in 2004 "ported from 2.4". The rough
logic is that the stack is allowed to grow to 1MB with no extra
checking. Over 1MB the access must be within 2048 bytes of the stack
pointer, or be from a user instruction that updates the stack pointer.
The 2048 byte allowance below the stack pointer is there to cover the
288 byte "red zone" as well as the "about 1.5kB" needed by the signal
delivery code.
Unfortunately since then the signal frame has expanded, and is now
4224 bytes on 64-bit kernels with transactional memory enabled. This
means if a process has consumed more than 1MB of stack, and its stack
pointer lies less than 4224 bytes from the next page boundary, signal
delivery will fault when trying to expand the stack and the process
will see a SEGV.
The total size of the signal frame is the size of struct rt_sigframe
(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
64-bit).
The 2048 byte allowance was correct until 2008 as the signal frame
was:
struct rt_sigframe {
struct ucontext uc; /* 0 1440 */
/* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1440 16 */
unsigned int tramp[6]; /* 1456 24 */
struct siginfo * pinfo; /* 1480 8 */
void * puc; /* 1488 8 */
struct siginfo info; /* 1496 128 */
/* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1624 288 */
/* size: 1920, cachelines: 15, members: 7 */
/* padding: 8 */
};
1920 + 128 = 2048
Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
ptrace and signal support") (Jul 2008) the signal frame expanded to
2304 bytes:
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */ <--
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1696 16 */
unsigned int tramp[6]; /* 1712 24 */
struct siginfo * pinfo; /* 1736 8 */
void * puc; /* 1744 8 */
struct siginfo info; /* 1752 128 */
/* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1880 288 */
/* size: 2176, cachelines: 17, members: 7 */
/* padding: 8 */
};
2176 + 128 = 2304
At this point we should have been exposed to the bug, though as far as
I know it was never reported. I no longer have a system old enough to
easily test on.
Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
grow-down stack segment") caused our stack expansion code to never
trigger, as there was always a VMA found for a write up to PAGE_SIZE
below r1.
That meant the bug was hidden as we continued to expand the signal
frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
state to the signal context") (Feb 2013):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */ <--
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[288]; /* 3576 288 */
/* size: 3872, cachelines: 31, members: 8 */
/* padding: 8 */
/* last cacheline: 32 bytes */
};
3872 + 128 = 4000
And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
userspace to 512 bytes") (Feb 2014):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[512]; /* 3576 512 */ <--
/* size: 4096, cachelines: 32, members: 8 */
/* padding: 8 */
};
4096 + 128 = 4224
Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
gap, between vmas") exposed us to the existing bug, because it changed
the stack VMA to be the correct/real size, meaning our stack expansion
code is now triggered.
Fix it by increasing the allowance to 4224 bytes.
Hard-coding 4224 is obviously unsafe against future expansions of the
signal frame in the same way as the existing code. We can't easily use
sizeof() because the signal frame structure is not in a header. We
will either fix that, or rip out all the custom stack expansion
checking logic entirely.
Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
Cc: stable(a)vger.kernel.org # v2.6.27+
Reported-by: Tom Lane <tgl(a)sss.pgh.pa.us>
Tested-by: Daniel Axtens <dja(a)axtens.net>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20200724092528.1578671-2-mpe@ellerman.id.au
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 641fc5f3d7dd..3ebb1792e636 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
return false;
}
+// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
+#define SIGFRAME_MAX_SIZE (4096 + 128)
+
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
struct vm_area_struct *vma, unsigned int flags,
bool *must_retry)
@@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
+ * The kernel signal delivery code writes a bit over 4KB
* below the stack pointer (r1) before decrementing it.
* The exec code can write slightly over 640kB to the stack
* before setting the user r1. Thus we allow the stack to
@@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 >= uregs->gpr[1])
+ if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
return false;
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 63dee5df43a31f3844efabc58972f0a206ca4534 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe(a)ellerman.id.au>
Date: Fri, 24 Jul 2020 19:25:25 +1000
Subject: [PATCH] powerpc: Allow 4224 bytes of stack expansion for the signal
frame
We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.
The code was originally added in 2004 "ported from 2.4". The rough
logic is that the stack is allowed to grow to 1MB with no extra
checking. Over 1MB the access must be within 2048 bytes of the stack
pointer, or be from a user instruction that updates the stack pointer.
The 2048 byte allowance below the stack pointer is there to cover the
288 byte "red zone" as well as the "about 1.5kB" needed by the signal
delivery code.
Unfortunately since then the signal frame has expanded, and is now
4224 bytes on 64-bit kernels with transactional memory enabled. This
means if a process has consumed more than 1MB of stack, and its stack
pointer lies less than 4224 bytes from the next page boundary, signal
delivery will fault when trying to expand the stack and the process
will see a SEGV.
The total size of the signal frame is the size of struct rt_sigframe
(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
64-bit).
The 2048 byte allowance was correct until 2008 as the signal frame
was:
struct rt_sigframe {
struct ucontext uc; /* 0 1440 */
/* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1440 16 */
unsigned int tramp[6]; /* 1456 24 */
struct siginfo * pinfo; /* 1480 8 */
void * puc; /* 1488 8 */
struct siginfo info; /* 1496 128 */
/* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1624 288 */
/* size: 1920, cachelines: 15, members: 7 */
/* padding: 8 */
};
1920 + 128 = 2048
Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
ptrace and signal support") (Jul 2008) the signal frame expanded to
2304 bytes:
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */ <--
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
long unsigned int _unused[2]; /* 1696 16 */
unsigned int tramp[6]; /* 1712 24 */
struct siginfo * pinfo; /* 1736 8 */
void * puc; /* 1744 8 */
struct siginfo info; /* 1752 128 */
/* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
char abigap[288]; /* 1880 288 */
/* size: 2176, cachelines: 17, members: 7 */
/* padding: 8 */
};
2176 + 128 = 2304
At this point we should have been exposed to the bug, though as far as
I know it was never reported. I no longer have a system old enough to
easily test on.
Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
grow-down stack segment") caused our stack expansion code to never
trigger, as there was always a VMA found for a write up to PAGE_SIZE
below r1.
That meant the bug was hidden as we continued to expand the signal
frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
state to the signal context") (Feb 2013):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */ <--
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[288]; /* 3576 288 */
/* size: 3872, cachelines: 31, members: 8 */
/* padding: 8 */
/* last cacheline: 32 bytes */
};
3872 + 128 = 4000
And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
userspace to 512 bytes") (Feb 2014):
struct rt_sigframe {
struct ucontext uc; /* 0 1696 */
/* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
struct ucontext uc_transact; /* 1696 1696 */
/* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
long unsigned int _unused[2]; /* 3392 16 */
unsigned int tramp[6]; /* 3408 24 */
struct siginfo * pinfo; /* 3432 8 */
void * puc; /* 3440 8 */
struct siginfo info; /* 3448 128 */
/* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
char abigap[512]; /* 3576 512 */ <--
/* size: 4096, cachelines: 32, members: 8 */
/* padding: 8 */
};
4096 + 128 = 4224
Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
gap, between vmas") exposed us to the existing bug, because it changed
the stack VMA to be the correct/real size, meaning our stack expansion
code is now triggered.
Fix it by increasing the allowance to 4224 bytes.
Hard-coding 4224 is obviously unsafe against future expansions of the
signal frame in the same way as the existing code. We can't easily use
sizeof() because the signal frame structure is not in a header. We
will either fix that, or rip out all the custom stack expansion
checking logic entirely.
Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
Cc: stable(a)vger.kernel.org # v2.6.27+
Reported-by: Tom Lane <tgl(a)sss.pgh.pa.us>
Tested-by: Daniel Axtens <dja(a)axtens.net>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20200724092528.1578671-2-mpe@ellerman.id.au
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 641fc5f3d7dd..3ebb1792e636 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
return false;
}
+// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
+#define SIGFRAME_MAX_SIZE (4096 + 128)
+
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
struct vm_area_struct *vma, unsigned int flags,
bool *must_retry)
@@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
/*
* N.B. The POWER/Open ABI allows programs to access up to
* 288 bytes below the stack pointer.
- * The kernel signal delivery code writes up to about 1.5kB
+ * The kernel signal delivery code writes a bit over 4KB
* below the stack pointer (r1) before decrementing it.
* The exec code can write slightly over 640kB to the stack
* before setting the user r1. Thus we allow the stack to
@@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 >= uregs->gpr[1])
+ if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
return false;
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
Hi,
please add patch
881a3a11c2b8 ("btrfs: fix return value mixup in btrfs_get_extent")
to 4.19 tree.
It's a fixup for patch 6bf9e4bd6a27 ("btrfs: inode: Verify inode mode to
avoid NULL pointer dereference"). I don't see it queued yet and the patch
is tagged for 5.4, this is just a heads up so it's not forgotten.
All the related patches:
6bf9e4bd6a27 ("btrfs: inode: Verify inode mode to avoid NULL pointer dereference")
9f7fec0ba891 ("Btrfs: fix selftests failure due to uninitialized i_mode in test inodes")
881a3a11c2b8 ("btrfs: fix return value mixup in btrfs_get_extent")
>From 5.4 up it's fine.
Thanks.
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ef3959b29c4a5bd65526ab310a1a18ae533172a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 22 Jul 2020 11:12:46 -0400
Subject: [PATCH] btrfs: don't show full path of bind mounts in subvol=
Chris Murphy reported a problem where rpm ostree will bind mount a bunch
of things for whatever voodoo it's doing. But when it does this
/proc/mounts shows something like
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
Despite subvolid=256 being subvol=/foo. This is because we're just
spitting out the dentry of the mount point, which in the case of bind
mounts is the source path for the mountpoint. Instead we should spit
out the path to the actual subvol. Fix this by looking up the name for
the subvolid we have mounted. With this fix the same test looks like
this
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
Reported-by: Chris Murphy <chris(a)colorremedies.com>
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aa73422b0678..9b4e9c4c4673 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1386,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1472,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ef3959b29c4a5bd65526ab310a1a18ae533172a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 22 Jul 2020 11:12:46 -0400
Subject: [PATCH] btrfs: don't show full path of bind mounts in subvol=
Chris Murphy reported a problem where rpm ostree will bind mount a bunch
of things for whatever voodoo it's doing. But when it does this
/proc/mounts shows something like
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
Despite subvolid=256 being subvol=/foo. This is because we're just
spitting out the dentry of the mount point, which in the case of bind
mounts is the source path for the mountpoint. Instead we should spit
out the path to the actual subvol. Fix this by looking up the name for
the subvolid we have mounted. With this fix the same test looks like
this
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
Reported-by: Chris Murphy <chris(a)colorremedies.com>
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aa73422b0678..9b4e9c4c4673 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1386,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1472,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1e6e238c3002ea3611465ce5f32777ddd6a40126 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Tue, 28 Jul 2020 16:39:26 +0800
Subject: [PATCH] btrfs: inode: fix NULL pointer dereference if inode doesn't
need compression
[BUG]
There is a bug report of NULL pointer dereference caused in
compress_file_extent():
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Workqueue: btrfs-delalloc btrfs_delalloc_helper [btrfs]
NIP [c008000006dd4d34] compress_file_range.constprop.41+0x75c/0x8a0 [btrfs]
LR [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs]
Call Trace:
[c000000c69093b00] [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs] (unreliable)
[c000000c69093bd0] [c008000006dd4ebc] async_cow_start+0x44/0xa0 [btrfs]
[c000000c69093c10] [c008000006e14824] normal_work_helper+0xdc/0x598 [btrfs]
[c000000c69093c80] [c0000000001608c0] process_one_work+0x2c0/0x5b0
[c000000c69093d10] [c000000000160c38] worker_thread+0x88/0x660
[c000000c69093db0] [c00000000016b55c] kthread+0x1ac/0x1c0
[c000000c69093e20] [c00000000000b660] ret_from_kernel_thread+0x5c/0x7c
---[ end trace f16954aa20d822f6 ]---
[CAUSE]
For the following execution route of compress_file_range(), it's
possible to hit NULL pointer dereference:
compress_file_extent()
|- pages = NULL;
|- start = async_chunk->start = 0;
|- end = async_chunk = 4095;
|- nr_pages = 1;
|- inode_need_compress() == false; <<< Possible, see later explanation
| Now, we have nr_pages = 1, pages = NULL
|- cont:
|- ret = cow_file_range_inline();
|- if (ret <= 0) {
|- for (i = 0; i < nr_pages; i++) {
|- WARN_ON(pages[i]->mapping); <<< Crash
To enter above call execution branch, we need the following race:
Thread 1 (chattr) | Thread 2 (writeback)
--------------------------+------------------------------
| btrfs_run_delalloc_range
| |- inode_need_compress = true
| |- cow_file_range_async()
btrfs_ioctl_set_flag() |
|- binode_flags |= |
BTRFS_INODE_NOCOMPRESS |
| compress_file_range()
| |- inode_need_compress = false
| |- nr_page = 1 while pages = NULL
| | Then hit the crash
[FIX]
This patch will fix it by checking @pages before doing accessing it.
This patch is only designed as a hot fix and easy to backport.
More elegant fix may make btrfs only check inode_need_compress() once to
avoid such race, but that would be another story.
Reported-by: Luciano Chavez <chavez(a)us.ibm.com>
Fixes: 4d3a800ebb12 ("btrfs: merge nr_pages input and output parameter in compress_pages")
CC: stable(a)vger.kernel.org # 4.14.x: cecc8d9038d16: btrfs: Move free_pages_out label in inline extent handling branch in compress_file_range
CC: stable(a)vger.kernel.org # 4.14+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b3412fbfd..9988d754e465 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -653,12 +653,18 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1e6e238c3002ea3611465ce5f32777ddd6a40126 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Tue, 28 Jul 2020 16:39:26 +0800
Subject: [PATCH] btrfs: inode: fix NULL pointer dereference if inode doesn't
need compression
[BUG]
There is a bug report of NULL pointer dereference caused in
compress_file_extent():
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Workqueue: btrfs-delalloc btrfs_delalloc_helper [btrfs]
NIP [c008000006dd4d34] compress_file_range.constprop.41+0x75c/0x8a0 [btrfs]
LR [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs]
Call Trace:
[c000000c69093b00] [c008000006dd4d1c] compress_file_range.constprop.41+0x744/0x8a0 [btrfs] (unreliable)
[c000000c69093bd0] [c008000006dd4ebc] async_cow_start+0x44/0xa0 [btrfs]
[c000000c69093c10] [c008000006e14824] normal_work_helper+0xdc/0x598 [btrfs]
[c000000c69093c80] [c0000000001608c0] process_one_work+0x2c0/0x5b0
[c000000c69093d10] [c000000000160c38] worker_thread+0x88/0x660
[c000000c69093db0] [c00000000016b55c] kthread+0x1ac/0x1c0
[c000000c69093e20] [c00000000000b660] ret_from_kernel_thread+0x5c/0x7c
---[ end trace f16954aa20d822f6 ]---
[CAUSE]
For the following execution route of compress_file_range(), it's
possible to hit NULL pointer dereference:
compress_file_extent()
|- pages = NULL;
|- start = async_chunk->start = 0;
|- end = async_chunk = 4095;
|- nr_pages = 1;
|- inode_need_compress() == false; <<< Possible, see later explanation
| Now, we have nr_pages = 1, pages = NULL
|- cont:
|- ret = cow_file_range_inline();
|- if (ret <= 0) {
|- for (i = 0; i < nr_pages; i++) {
|- WARN_ON(pages[i]->mapping); <<< Crash
To enter above call execution branch, we need the following race:
Thread 1 (chattr) | Thread 2 (writeback)
--------------------------+------------------------------
| btrfs_run_delalloc_range
| |- inode_need_compress = true
| |- cow_file_range_async()
btrfs_ioctl_set_flag() |
|- binode_flags |= |
BTRFS_INODE_NOCOMPRESS |
| compress_file_range()
| |- inode_need_compress = false
| |- nr_page = 1 while pages = NULL
| | Then hit the crash
[FIX]
This patch will fix it by checking @pages before doing accessing it.
This patch is only designed as a hot fix and easy to backport.
More elegant fix may make btrfs only check inode_need_compress() once to
avoid such race, but that would be another story.
Reported-by: Luciano Chavez <chavez(a)us.ibm.com>
Fixes: 4d3a800ebb12 ("btrfs: merge nr_pages input and output parameter in compress_pages")
CC: stable(a)vger.kernel.org # 4.14.x: cecc8d9038d16: btrfs: Move free_pages_out label in inline extent handling branch in compress_file_range
CC: stable(a)vger.kernel.org # 4.14+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b3412fbfd..9988d754e465 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -653,12 +653,18 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cecc8d9038d164eda61fbcd72520975a554ea63e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 17 Jul 2019 14:41:45 +0300
Subject: [PATCH] btrfs: Move free_pages_out label in inline extent handling
branch in compress_file_range
This label is only executed if compress_file_range fails to create an
inline extent. So move its code in the semantically related inline
extent handling branch. No functional changes.
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index db814f555b26..385127ab0841 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -622,7 +622,14 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- goto free_pages_out;
+
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+
+ return 0;
}
}
@@ -700,15 +707,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
compressed_extents++;
return compressed_extents;
-
-free_pages_out:
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
- }
- kfree(pages);
-
- return 0;
}
static void free_async_extent_pages(struct async_extent *async_extent)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cecc8d9038d164eda61fbcd72520975a554ea63e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 17 Jul 2019 14:41:45 +0300
Subject: [PATCH] btrfs: Move free_pages_out label in inline extent handling
branch in compress_file_range
This label is only executed if compress_file_range fails to create an
inline extent. So move its code in the semantically related inline
extent handling branch. No functional changes.
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index db814f555b26..385127ab0841 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -622,7 +622,14 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- goto free_pages_out;
+
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+
+ return 0;
}
}
@@ -700,15 +707,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
compressed_extents++;
return compressed_extents;
-
-free_pages_out:
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
- }
- kfree(pages);
-
- return 0;
}
static void free_async_extent_pages(struct async_extent *async_extent)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ef3959b29c4a5bd65526ab310a1a18ae533172a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 22 Jul 2020 11:12:46 -0400
Subject: [PATCH] btrfs: don't show full path of bind mounts in subvol=
Chris Murphy reported a problem where rpm ostree will bind mount a bunch
of things for whatever voodoo it's doing. But when it does this
/proc/mounts shows something like
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
Despite subvolid=256 being subvol=/foo. This is because we're just
spitting out the dentry of the mount point, which in the case of bind
mounts is the source path for the mountpoint. Instead we should spit
out the path to the actual subvol. Fix this by looking up the name for
the subvolid we have mounted. With this fix the same test looks like
this
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
Reported-by: Chris Murphy <chris(a)colorremedies.com>
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aa73422b0678..9b4e9c4c4673 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1386,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1472,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ef3959b29c4a5bd65526ab310a1a18ae533172a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 22 Jul 2020 11:12:46 -0400
Subject: [PATCH] btrfs: don't show full path of bind mounts in subvol=
Chris Murphy reported a problem where rpm ostree will bind mount a bunch
of things for whatever voodoo it's doing. But when it does this
/proc/mounts shows something like
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
Despite subvolid=256 being subvol=/foo. This is because we're just
spitting out the dentry of the mount point, which in the case of bind
mounts is the source path for the mountpoint. Instead we should spit
out the path to the actual subvol. Fix this by looking up the name for
the subvolid we have mounted. With this fix the same test looks like
this
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
Reported-by: Chris Murphy <chris(a)colorremedies.com>
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aa73422b0678..9b4e9c4c4673 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1386,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1472,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ef3959b29c4a5bd65526ab310a1a18ae533172a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 22 Jul 2020 11:12:46 -0400
Subject: [PATCH] btrfs: don't show full path of bind mounts in subvol=
Chris Murphy reported a problem where rpm ostree will bind mount a bunch
of things for whatever voodoo it's doing. But when it does this
/proc/mounts shows something like
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo/bar 0 0
Despite subvolid=256 being subvol=/foo. This is because we're just
spitting out the dentry of the mount point, which in the case of bind
mounts is the source path for the mountpoint. Instead we should spit
out the path to the actual subvol. Fix this by looking up the name for
the subvolid we have mounted. With this fix the same test looks like
this
/dev/sda /mnt/test btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
/dev/sda /mnt/test/baz btrfs rw,relatime,subvolid=256,subvol=/foo 0 0
Reported-by: Chris Murphy <chris(a)colorremedies.com>
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aa73422b0678..9b4e9c4c4673 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1386,6 +1386,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1472,8 +1473,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From fbabd4a36faaf74c83142d0b3d950c11ec14fda1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Tue, 21 Jul 2020 10:38:37 -0400
Subject: [PATCH] btrfs: return EROFS for BTRFS_FS_STATE_ERROR cases
Eric reported seeing this message while running generic/475
BTRFS: error (device dm-3) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted
Full stack trace:
BTRFS: error (device dm-0) in btrfs_commit_transaction:2323: errno=-5 IO failure (Error while writing out transaction)
BTRFS info (device dm-0): forced readonly
BTRFS warning (device dm-0): Skipping commit of aborted transaction.
------------[ cut here ]------------
BTRFS: error (device dm-0) in cleanup_transaction:1894: errno=-5 IO failure
BTRFS: Transaction aborted (error -117)
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6480 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6488 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6490 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6498 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64c0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85e8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85f0 len 4096 err no 10
WARNING: CPU: 3 PID: 23985 at fs/btrfs/tree-log.c:3084 btrfs_sync_log+0xbc8/0xd60 [btrfs]
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4288 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4290 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4298 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42b0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42b8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c0 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c8 len 4096 err no 10
BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42d0 len 4096 err no 10
CPU: 3 PID: 23985 Comm: fsstress Tainted: G W L 5.8.0-rc4-default+ #1181
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
RIP: 0010:btrfs_sync_log+0xbc8/0xd60 [btrfs]
RSP: 0018:ffff909a44d17bd0 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001
RDX: ffff8f3be41cb940 RSI: ffffffffb0108d2b RDI: ffffffffb0108ff7
RBP: ffff909a44d17e70 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000037988 R12: ffff8f3bd20e4000
R13: ffff8f3bd20e4428 R14: 00000000ffffff8b R15: ffff909a44d17c70
FS: 00007f6a6ed3fb80(0000) GS:ffff8f3c3dc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f6a6ed3e000 CR3: 00000000525c0003 CR4: 0000000000160ee0
Call Trace:
? finish_wait+0x90/0x90
? __mutex_unlock_slowpath+0x45/0x2a0
? lock_acquire+0xa3/0x440
? lockref_put_or_lock+0x9/0x30
? dput+0x20/0x4a0
? dput+0x20/0x4a0
? do_raw_spin_unlock+0x4b/0xc0
? _raw_spin_unlock+0x1f/0x30
btrfs_sync_file+0x335/0x490 [btrfs]
do_fsync+0x38/0x70
__x64_sys_fsync+0x10/0x20
do_syscall_64+0x50/0xe0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f6a6ef1b6e3
Code: Bad RIP value.
RSP: 002b:00007ffd01e20038 EFLAGS: 00000246 ORIG_RAX: 000000000000004a
RAX: ffffffffffffffda RBX: 000000000007a120 RCX: 00007f6a6ef1b6e3
RDX: 00007ffd01e1ffa0 RSI: 00007ffd01e1ffa0 RDI: 0000000000000003
RBP: 0000000000000003 R08: 0000000000000001 R09: 00007ffd01e2004c
R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000009f
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffffb007fe0b>] copy_process+0x67b/0x1b00
softirqs last enabled at (0): [<ffffffffb007fe0b>] copy_process+0x67b/0x1b00
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace af146e0e38433456 ]---
BTRFS: error (device dm-0) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted
This ret came from btrfs_write_marked_extents(). If we get an aborted
transaction via EIO before, we'll see it in btree_write_cache_pages()
and return EUCLEAN, which gets printed as "Filesystem corrupted".
Except we shouldn't be returning EUCLEAN here, we need to be returning
EROFS because EUCLEAN is reserved for actual corruption, not IO errors.
We are inconsistent about our handling of BTRFS_FS_STATE_ERROR
elsewhere, but we want to use EROFS for this particular case. The
original transaction abort has the real error code for why we ended up
with an aborted transaction, all subsequent actions just need to return
EROFS because they may not have a trans handle and have no idea about
the original cause of the abort.
After patch "btrfs: don't WARN if we abort a transaction with EROFS" the
stacktrace will not be dumped either.
Reported-by: Eric Sandeen <esandeen(a)redhat.com>
CC: stable(a)vger.kernel.org # 5.4+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
[ add full test stacktrace ]
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 73c9c59cd535..3fbc37692592 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4119,7 +4119,7 @@ int btree_write_cache_pages(struct address_space *mapping,
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
- ret = -EUCLEAN;
+ ret = -EROFS;
end_write_bio(&epd, ret);
}
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d935ac06323f..5a6cb9db512e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3691,7 +3691,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- return -EIO;
+ return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index efafc286323c..20c6ac1a5de7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
- err = -EIO;
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba(a)suse.com>
Date: Thu, 25 Jun 2020 12:35:28 +0200
Subject: [PATCH] btrfs: allow use of global block reserve for balance item
deletion
On a filesystem with exhausted metadata, but still enough to start
balance, it's possible to hit this error:
[324402.053842] BTRFS info (device loop0): 1 enospc errors during balance
[324402.060769] BTRFS info (device loop0): balance: ended with status: -28
[324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left
It fails inside reset_balance_state and turns the filesystem to
read-only, which is unnecessary and should be fixed too, but the problem
is caused by lack for space when the balance item is deleted. This is a
one-time operation and from the same rank as unlink that is allowed to
use the global block reserve. So do the same for the balance item.
Status of the filesystem (100GiB) just after the balance fails:
$ btrfs fi df mnt
Data, single: total=80.01GiB, used=38.58GiB
System, single: total=4.00MiB, used=16.00KiB
Metadata, single: total=19.99GiB, used=19.48GiB
GlobalReserve, single: total=512.00MiB, used=50.11MiB
CC: stable(a)vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f403fb1e6d37..62ae89b078f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba(a)suse.com>
Date: Thu, 25 Jun 2020 12:35:28 +0200
Subject: [PATCH] btrfs: allow use of global block reserve for balance item
deletion
On a filesystem with exhausted metadata, but still enough to start
balance, it's possible to hit this error:
[324402.053842] BTRFS info (device loop0): 1 enospc errors during balance
[324402.060769] BTRFS info (device loop0): balance: ended with status: -28
[324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left
It fails inside reset_balance_state and turns the filesystem to
read-only, which is unnecessary and should be fixed too, but the problem
is caused by lack for space when the balance item is deleted. This is a
one-time operation and from the same rank as unlink that is allowed to
use the global block reserve. So do the same for the balance item.
Status of the filesystem (100GiB) just after the balance fails:
$ btrfs fi df mnt
Data, single: total=80.01GiB, used=38.58GiB
System, single: total=4.00MiB, used=16.00KiB
Metadata, single: total=19.99GiB, used=19.48GiB
GlobalReserve, single: total=512.00MiB, used=50.11MiB
CC: stable(a)vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f403fb1e6d37..62ae89b078f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba(a)suse.com>
Date: Thu, 25 Jun 2020 12:35:28 +0200
Subject: [PATCH] btrfs: allow use of global block reserve for balance item
deletion
On a filesystem with exhausted metadata, but still enough to start
balance, it's possible to hit this error:
[324402.053842] BTRFS info (device loop0): 1 enospc errors during balance
[324402.060769] BTRFS info (device loop0): balance: ended with status: -28
[324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left
It fails inside reset_balance_state and turns the filesystem to
read-only, which is unnecessary and should be fixed too, but the problem
is caused by lack for space when the balance item is deleted. This is a
one-time operation and from the same rank as unlink that is allowed to
use the global block reserve. So do the same for the balance item.
Status of the filesystem (100GiB) just after the balance fails:
$ btrfs fi df mnt
Data, single: total=80.01GiB, used=38.58GiB
System, single: total=4.00MiB, used=16.00KiB
Metadata, single: total=19.99GiB, used=19.48GiB
GlobalReserve, single: total=512.00MiB, used=50.11MiB
CC: stable(a)vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f403fb1e6d37..62ae89b078f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba(a)suse.com>
Date: Thu, 25 Jun 2020 12:35:28 +0200
Subject: [PATCH] btrfs: allow use of global block reserve for balance item
deletion
On a filesystem with exhausted metadata, but still enough to start
balance, it's possible to hit this error:
[324402.053842] BTRFS info (device loop0): 1 enospc errors during balance
[324402.060769] BTRFS info (device loop0): balance: ended with status: -28
[324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left
It fails inside reset_balance_state and turns the filesystem to
read-only, which is unnecessary and should be fixed too, but the problem
is caused by lack for space when the balance item is deleted. This is a
one-time operation and from the same rank as unlink that is allowed to
use the global block reserve. So do the same for the balance item.
Status of the filesystem (100GiB) just after the balance fails:
$ btrfs fi df mnt
Data, single: total=80.01GiB, used=38.58GiB
System, single: total=4.00MiB, used=16.00KiB
Metadata, single: total=19.99GiB, used=19.48GiB
GlobalReserve, single: total=512.00MiB, used=50.11MiB
CC: stable(a)vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f403fb1e6d37..62ae89b078f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);