commit 1007843a91909a4995ee78a538f62d8665705b66 upstream.
syzbot is reporting circular locking dependency which involves zonelist_update_seq seqlock [1], for this lock is checked by memory allocation requests which do not need to be retried.
One deadlock scenario is kmalloc(GFP_ATOMIC) from an interrupt handler.
CPU0 ---- __build_all_zonelists() { write_seqlock(&zonelist_update_seq); // makes zonelist_update_seq.seqcount odd // e.g. timer interrupt handler runs at this moment some_timer_func() { kmalloc(GFP_ATOMIC) { __alloc_pages_slowpath() { read_seqbegin(&zonelist_update_seq) { // spins forever because zonelist_update_seq.seqcount is odd } } } } // e.g. timer interrupt handler finishes write_sequnlock(&zonelist_update_seq); // makes zonelist_update_seq.seqcount even }
This deadlock scenario can be easily eliminated by not calling read_seqbegin(&zonelist_update_seq) from !__GFP_DIRECT_RECLAIM allocation requests, for retry is applicable to only __GFP_DIRECT_RECLAIM allocation requests. But Michal Hocko does not know whether we should go with this approach.
Another deadlock scenario which syzbot is reporting is a race between kmalloc(GFP_ATOMIC) from tty_insert_flip_string_and_push_buffer() with port->lock held and printk() from __build_all_zonelists() with zonelist_update_seq held.
CPU0 CPU1 ---- ---- pty_write() { tty_insert_flip_string_and_push_buffer() { __build_all_zonelists() { write_seqlock(&zonelist_update_seq); build_zonelists() { printk() { vprintk() { vprintk_default() { vprintk_emit() { console_unlock() { console_flush_all() { console_emit_next_record() { con->write() = serial8250_console_write() { spin_lock_irqsave(&port->lock, flags); tty_insert_flip_string() { tty_insert_flip_string_fixed_flag() { __tty_buffer_request_room() { tty_buffer_alloc() { kmalloc(GFP_ATOMIC | __GFP_NOWARN) { __alloc_pages_slowpath() { zonelist_iter_begin() { read_seqbegin(&zonelist_update_seq); // spins forever because zonelist_update_seq.seqcount is odd spin_lock_irqsave(&port->lock, flags); // spins forever because port->lock is held } } } } } } } } spin_unlock_irqrestore(&port->lock, flags); // message is printed to console spin_unlock_irqrestore(&port->lock, flags); } } } } } } } } } write_sequnlock(&zonelist_update_seq); } } }
This deadlock scenario can be eliminated by
preventing interrupt context from calling kmalloc(GFP_ATOMIC)
and
preventing printk() from calling console_flush_all()
while zonelist_update_seq.seqcount is odd.
Since Petr Mladek thinks that __build_all_zonelists() can become a candidate for deferring printk() [2], let's address this problem by
disabling local interrupts in order to avoid kmalloc(GFP_ATOMIC)
and
disabling synchronous printk() in order to avoid console_flush_all()
.
As a side effect of minimizing duration of zonelist_update_seq.seqcount being odd by disabling synchronous printk(), latency at read_seqbegin(&zonelist_update_seq) for both !__GFP_DIRECT_RECLAIM and __GFP_DIRECT_RECLAIM allocation requests will be reduced. Although, from lockdep perspective, not calling read_seqbegin(&zonelist_update_seq) (i.e. do not record unnecessary locking dependency) from interrupt context is still preferable, even if we don't allow calling kmalloc(GFP_ATOMIC) inside write_seqlock(&zonelist_update_seq)/write_sequnlock(&zonelist_update_seq) section...
Link: https://lkml.kernel.org/r/8796b95c-3da3-5885-fddd-6ef55f30e4d3@I-love.SAKURA... Fixes: 3d36424b3b58 ("mm/page_alloc: fix race condition between build_all_zonelists and page allocation") Link: https://lkml.kernel.org/r/ZCrs+1cDqPWTDFNM@alley [2] Reported-by: syzbot syzbot+223c7461c58c58a4cb10@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=223c7461c58c58a4cb10 [1] Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Acked-by: Michal Hocko mhocko@suse.com Acked-by: Mel Gorman mgorman@techsingularity.net Cc: Petr Mladek pmladek@suse.com Cc: David Hildenbrand david@redhat.com Cc: Ilpo Järvinen ilpo.jarvinen@linux.intel.com Cc: John Ogness john.ogness@linutronix.de Cc: Patrick Daly quic_pdaly@quicinc.com Cc: Sergey Senozhatsky senozhatsky@chromium.org Cc: Steven Rostedt rostedt@goodmis.org Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org [ Copied printk_deferred_{enter,safe}() definition from commit 85e3e7fbbb72 ("printk: remove NMI tracking"). -- penguin-kernel ] Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp --- include/linux/printk.h | 20 ++++++++++++++++++++ mm/page_alloc.c | 16 ++++++++++++++++ 2 files changed, 36 insertions(+)
diff --git a/include/linux/printk.h b/include/linux/printk.h index 6106befed756..3441f181faa3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -528,4 +528,24 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif
+/* Partial backport of commit 85e3e7fbbb72 ("printk: remove NMI tracking") */ +#ifdef CONFIG_PRINTK +extern void __printk_safe_enter(void); +extern void __printk_safe_exit(void); +/* + * The printk_deferred_enter/exit macros are available only as a hack for + * some code paths that need to defer all printk console printing. Interrupts + * must be disabled for the deferred duration. + */ +#define printk_deferred_enter __printk_safe_enter +#define printk_deferred_exit __printk_safe_exit +#else +static inline void printk_deferred_enter(void) +{ +} +static inline void printk_deferred_exit(void) +{ +} +#endif + #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f88307fee38d..7163cf682582 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5213,7 +5213,21 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; + unsigned long flags;
+ /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA @@ -5248,6 +5262,8 @@ static void __build_all_zonelists(void *data) }
write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); }
static noinline void __init