@ arch/arm/Kconfig:37 @ config ARM
 	select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
+	select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_MEMTEST
@ arch/arm/Kconfig:76 @ config ARM
 	select HARDIRQS_SW_RESEND
 	select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
-	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT
 	select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL
 	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
 	select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
@ arch/arm/Kconfig:121 @ config ARM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
+	select HAVE_PREEMPT_LAZY
 	select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RSEQ
@ arch/arm/include/asm/assembler.h:247 @ THUMB(	fpreg	.req	r7	)
 	.endm
 #endif
 
-	.macro	local_bh_disable, ti, tmp
-	ldr	\tmp, [\ti, #TI_PREEMPT]
-	add	\tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET
-	str	\tmp, [\ti, #TI_PREEMPT]
-	.endm
-
-	.macro	local_bh_enable_ti, ti, tmp
-	get_thread_info \ti
-	ldr	\tmp, [\ti, #TI_PREEMPT]
-	sub	\tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET
-	str	\tmp, [\ti, #TI_PREEMPT]
-	.endm
-
 #define USERL(l, x...)				\
 9999:	x;					\
 	.pushsection __ex_table,"a";		\
@ arch/arm/include/asm/thread_info.h:65 @ struct cpu_context_save {
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
+	int			preempt_lazy_count; /* 0 => preemptable, <0 => bug */
 	__u32			cpu;		/* cpu */
 	__u32			cpu_domain;	/* cpu domain */
 	struct cpu_context_save	cpu_context;	/* cpu context */
@ arch/arm/include/asm/thread_info.h:133 @ extern int vfp_restore_user_hwstate(struct user_vfp *,
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_UPROBE		3	/* breakpointed or singlestepping */
 #define TIF_NOTIFY_SIGNAL	4	/* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY	5
 
 #define TIF_USING_IWMMXT	17
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
@ arch/arm/include/asm/thread_info.h:153 @ extern int vfp_restore_user_hwstate(struct user_vfp *,
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
 
 /* Checks for any syscall work in entry-common.S */
@ arch/arm/include/asm/thread_info.h:163 @ extern int vfp_restore_user_hwstate(struct user_vfp *,
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+				 _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
 				 _TIF_NOTIFY_SIGNAL)
 
@ arch/arm/kernel/asm-offsets.c:46 @ int main(void)
   BLANK();
   DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
   DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
+  DEFINE(TI_PREEMPT_LAZY,	offsetof(struct thread_info, preempt_lazy_count));
   DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
   DEFINE(TI_CPU_DOMAIN,		offsetof(struct thread_info, cpu_domain));
   DEFINE(TI_CPU_SAVE,		offsetof(struct thread_info, cpu_context));
@ arch/arm/kernel/entry-armv.S:225 @ ENDPROC(__dabt_svc)
 
 #ifdef CONFIG_PREEMPTION
 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
-	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
 	teq	r8, #0				@ if preempt count != 0
+	bne	1f				@ return from exception
+	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
+	tst	r0, #_TIF_NEED_RESCHED		@ if NEED_RESCHED is set
+	blne	svc_preempt			@ preempt!
+
+	ldr	r8, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
+	teq	r8, #0				@ if preempt lazy count != 0
 	movne	r0, #0				@ force flags to 0
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED_LAZY
 	blne	svc_preempt
+1:
 #endif
 
 	svc_exit r5, irq = 1			@ return from exception
@ arch/arm/kernel/entry-armv.S:251 @ ENDPROC(__irq_svc)
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
 	tst	r0, #_TIF_NEED_RESCHED
+	bne	1b
+	tst	r0, #_TIF_NEED_RESCHED_LAZY
 	reteq	r8				@ go again
-	b	1b
+	ldr	r0, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
+	teq	r0, #0				@ if preempt lazy count != 0
+	beq	1b
+	ret	r8				@ go again
+
 #endif
 
 __und_fault:
@ arch/arm/kernel/signal.c:610 @ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	 */
 	trace_hardirqs_off();
 	do {
-		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+		if (likely(thread_flags & (_TIF_NEED_RESCHED |
+					   _TIF_NEED_RESCHED_LAZY))) {
 			schedule();
 		} else {
 			if (unlikely(!user_mode(regs)))
@ arch/arm/mm/fault.c:439 @ do_translation_fault(unsigned long addr, unsigned int fsr,
 	if (addr < TASK_SIZE)
 		return do_page_fault(addr, fsr, regs);
 
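+	/*
+	 * Enable interrupts if they were enabled in the parent context, as
+	 * do_page_fault() does, before heading into code that may sleep or
+	 * deliver a signal.
+	 */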
+	if (interrupts_enabled(regs))
+		local_irq_enable();
+
 	if (user_mode(regs))
 		goto bad_area;
 
@ arch/arm/mm/fault.c:512 @ do_translation_fault(unsigned long addr, unsigned int fsr,
 static int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
+	if (interrupts_enabled(regs))
+		local_irq_enable();
+
 	do_bad_area(addr, fsr, regs);
 	return 0;
 }
@ arch/arm/vfp/entry.S:26 @
 @
 ENTRY(do_vfp)
 	mov	r1, r10
-	mov	r3, r9
- 	ldr	r4, .LCvfp
-	ldr	pc, [r4]		@ call VFP entry point
+	str	lr, [sp, #-8]!
+	add	r3, sp, #4
+	str	r9, [r3]
+	bl	vfp_entry
+	ldr	pc, [sp], #8
 ENDPROC(do_vfp)
-
-ENTRY(vfp_null_entry)
-	ret	lr
-ENDPROC(vfp_null_entry)
-
-	.align	2
-.LCvfp:
-	.word	vfp_vector
@ arch/arm/vfp/vfphw.S:78 @
 @  lr  = unrecognised instruction return address
 @  IRQs enabled.
 ENTRY(vfp_support_entry)
-	local_bh_disable r1, r4
-
 	ldr	r11, [r1, #TI_CPU]	@ CPU number
 	add	r10, r1, #TI_VFPSTATE	@ r10 = workspace
 
@ arch/arm/vfp/vfphw.S:175 @ ENTRY(vfp_support_entry)
 					@ out before setting an FPEXC that
 					@ stops us reading stuff
 	VFPFMXR	FPEXC, r1		@ Restore FPEXC last
+	mov	sp, r3			@ we think we have handled things
+	pop	{lr}
 	sub	r2, r2, #4		@ Retry current instruction - if Thumb
 	str	r2, [sp, #S_PC]		@ mode it's two 16-bit instructions,
 					@ else it's one 32-bit instruction, so
 					@ always subtract 4 from the following
 					@ instruction address.
-	local_bh_enable_ti r10, r4
-	ret	r3			@ we think we have handled things
 
+	b	vfp_exit	@ tail call
 
 look_for_VFP_exceptions:
 	@ Check for synchronous or asynchronous exception
@ arch/arm/vfp/vfphw.S:206 @ ENTRY(vfp_support_entry)
 	@ not recognised by VFP
 
 	DBGSTR	"not VFP"
-	local_bh_enable_ti r10, r4
-	ret	lr
+	b	vfp_exit	@ tail call
 
 process_exception:
 	DBGSTR	"bounce"
+	mov	sp, r3			@ setup for a return to the user code.
+	pop	{lr}
 	mov	r2, sp			@ nothing stacked - regdump is at TOS
-	mov	lr, r3			@ setup for a return to the user code.
 
 	@ Now call the C code to package up the bounce to the support code
 	@   r0 holds the trigger instruction
@ arch/arm/vfp/vfpmodule.c:35 @
 /*
  * Our undef handlers (in entry.S)
  */
-asmlinkage void vfp_support_entry(void);
-asmlinkage void vfp_null_entry(void);
+asmlinkage void vfp_support_entry(u32, void *, u32, u32);
 
-asmlinkage void (*vfp_vector)(void) = vfp_null_entry;
+static bool have_vfp __ro_after_init;
 
 /*
  * Dual-use variable.
@ arch/arm/vfp/vfpmodule.c:57 @ static unsigned int __initdata VFP_arch;
  */
 union vfp_state *vfp_current_hw_state[NR_CPUS];
 
+/*
+ * Claim ownership of the VFP unit.
+ *
+ * The caller may change VFP registers until vfp_unlock() is called.
+ *
+ * local_bh_disable() is used to disable preemption and to disable VFP
+ * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is
+ * not sufficient because it only serializes soft interrupt related sections
+ * via a local lock, but stays preemptible. Disabling preemption is the right
+ * choice here as bottom half processing is always in thread context on RT
+ * kernels so it implicitly prevents bottom half processing as well.
+ */
+static void vfp_lock(void)
+{
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_bh_disable();
+	else
+		preempt_disable();
+}
+
+static void vfp_unlock(void)
+{
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_bh_enable();
+	else
+		preempt_enable();
+}
+
 /*
  * Is 'thread's most up to date state stored in this CPUs hardware?
  * Must be called from non-preemptible context.
@ arch/arm/vfp/vfpmodule.c:270 @ static void vfp_panic(char *reason, u32 inst)
 /*
  * Process bitmask of exception conditions.
  */
-static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs)
+static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr)
 {
 	int si_code = 0;
 
@ arch/arm/vfp/vfpmodule.c:278 @ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_
 
 	if (exceptions == VFP_EXCEPTION_ERROR) {
 		vfp_panic("unhandled bounce", inst);
-		vfp_raise_sigfpe(FPE_FLTINV, regs);
-		return;
+		return FPE_FLTINV;
 	}
 
 	/*
@ arch/arm/vfp/vfpmodule.c:306 @ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_
 	RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF);
 	RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV);
 
-	if (si_code)
-		vfp_raise_sigfpe(si_code, regs);
+	return si_code;
 }
 
 /*
@ arch/arm/vfp/vfpmodule.c:351 @ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs)
 void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
 {
 	u32 fpscr, orig_fpscr, fpsid, exceptions;
+	int si_code2 = 0;
+	int si_code = 0;
 
 	pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc);
 
@ arch/arm/vfp/vfpmodule.c:400 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
 		 * unallocated VFP instruction but with FPSCR.IXE set and not
 		 * on VFP subarch 1.
 		 */
-		 vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs);
+		si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr);
 		goto exit;
 	}
 
@ arch/arm/vfp/vfpmodule.c:425 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
 	 */
 	exceptions = vfp_emulate_instruction(trigger, fpscr, regs);
 	if (exceptions)
-		vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs);
+		si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr);
 
 	/*
 	 * If there isn't a second FP instruction, exit now. Note that
@ arch/arm/vfp/vfpmodule.c:444 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
  emulate:
 	exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs);
 	if (exceptions)
-		vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs);
+		si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr);
+
  exit:
-	local_bh_enable();
+	vfp_unlock();
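+	/*
+	 * Deliver the pending SIGFPEs only after vfp_unlock(): raising a
+	 * signal may take sleeping locks, which is not allowed while
+	 * preemption (or bottom halves) are disabled.
+	 */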
+	if (si_code2)
+		vfp_raise_sigfpe(si_code2, regs);
+	if (si_code)
+		vfp_raise_sigfpe(si_code, regs);
 }
 
 static void vfp_enable(void *unused)
@ arch/arm/vfp/vfpmodule.c:550 @ static inline void vfp_pm_init(void) { }
  */
 void vfp_sync_hwstate(struct thread_info *thread)
 {
-	unsigned int cpu = get_cpu();
+	vfp_lock();
 
-	local_bh_disable();
-
-	if (vfp_state_in_hw(cpu, thread)) {
+	if (vfp_state_in_hw(raw_smp_processor_id(), thread)) {
 		u32 fpexc = fmrx(FPEXC);
 
 		/*
@ arch/arm/vfp/vfpmodule.c:563 @ void vfp_sync_hwstate(struct thread_info *thread)
 		fmxr(FPEXC, fpexc);
 	}
 
-	local_bh_enable();
-	put_cpu();
+	vfp_unlock();
 }
 
 /* Ensure that the thread reloads the hardware VFP state on the next use. */
@ arch/arm/vfp/vfpmodule.c:677 @ static int vfp_starting_cpu(unsigned int unused)
 	return 0;
 }
 
+/*
+ * Entered with:
+ *
+ *  r0  = instruction opcode (32-bit ARM or two 16-bit Thumb)
+ *  r1  = thread_info pointer
+ *  r2  = PC value to resume execution after successful emulation
+ *  r3  = normal "successful" return address
+ *  lr  = unrecognised instruction return address
+ */
+asmlinkage void vfp_entry(u32 trigger, struct thread_info *ti, u32 resume_pc,
+			  u32 resume_return_address)
+{
+	if (unlikely(!have_vfp))
+		return;
+
+	vfp_lock();
+	vfp_support_entry(trigger, ti, resume_pc, resume_return_address);
+}
+
+asmlinkage void vfp_exit(void)
+{
+	vfp_unlock();
+}
+
 #ifdef CONFIG_KERNEL_MODE_NEON
 
 static int vfp_kmode_exception(struct pt_regs *regs, unsigned int instr)
@ arch/arm/vfp/vfpmodule.c:776 @ void kernel_neon_begin(void)
 	unsigned int cpu;
 	u32 fpexc;
 
-	local_bh_disable();
+	vfp_lock();
 
 	/*
 	 * Kernel mode NEON is only allowed outside of hardirq context with
@ arch/arm/vfp/vfpmodule.c:807 @ void kernel_neon_end(void)
 {
 	/* Disable the NEON/VFP unit. */
 	fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
-	local_bh_enable();
+	vfp_unlock();
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
@ arch/arm/vfp/vfpmodule.c:854 @ static int __init vfp_init(void)
 	vfpsid = fmrx(FPSID);
 	barrier();
 	unregister_undef_hook(&vfp_detect_hook);
-	vfp_vector = vfp_null_entry;
 
 	pr_info("VFP support v0.3: ");
 	if (VFP_arch) {
@ arch/arm/vfp/vfpmodule.c:938 @ static int __init vfp_init(void)
 				  "arm/vfp:starting", vfp_starting_cpu,
 				  vfp_dying_cpu);
 
-	vfp_vector = vfp_support_entry;
+	have_vfp = true;
 
 	thread_register_notifier(&vfp_notifier_block);
 	vfp_pm_init();
@ arch/arm64/Kconfig:98 @ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+	select ARCH_SUPPORTS_RT
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@ arch/arm64/Kconfig:211 @ config ARM64
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_PREEMPT_DYNAMIC_KEY
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select HAVE_PREEMPT_LAZY
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select MMU_GATHER_RCU_TABLE_FREE
@ arch/arm64/include/asm/preempt.h:74 @ static inline bool __preempt_count_dec_and_test(void)
 	 * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE
 	 * pair.
 	 */
-	return !pc || !READ_ONCE(ti->preempt_count);
+	if (!pc || !READ_ONCE(ti->preempt_count))
+		return true;
+#ifdef CONFIG_PREEMPT_LAZY
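+	/*
+	 * Also honour a lazy reschedule request, but only when both the
+	 * regular preempt count and the lazy count are zero.
+	 */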
+	if ((pc & ~PREEMPT_NEED_RESCHED))
+		return false;
+	if (current_thread_info()->preempt_lazy_count)
+		return false;
+	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+	return false;
+#endif
 }
 
 static inline bool should_resched(int preempt_offset)
 {
+#ifdef CONFIG_PREEMPT_LAZY
+	u64 pc = READ_ONCE(current_thread_info()->preempt_count);
+	if (pc == preempt_offset)
+		return true;
+
+	if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
+		return false;
+
+	if (current_thread_info()->preempt_lazy_count)
+		return false;
+	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
 	u64 pc = READ_ONCE(current_thread_info()->preempt_count);
 	return pc == preempt_offset;
+#endif
 }
 
 #ifdef CONFIG_PREEMPTION
@ arch/arm64/include/asm/thread_info.h:29 @ struct thread_info {
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	u64			ttbr0;		/* saved TTBR0_EL1 */
 #endif
+	int			preempt_lazy_count;	/* 0 => preemptable, <0 => bug */
 	union {
 		u64		preempt_count;	/* 0 => preemptible, <0 => bug */
 		struct {
@ arch/arm64/include/asm/thread_info.h:72 @ int arch_dup_task_struct(struct task_struct *dst,
 #define TIF_UPROBE		4	/* uprobe breakpoint or singlestep */
 #define TIF_MTE_ASYNC_FAULT	5	/* MTE Asynchronous Tag Check Fault */
 #define TIF_NOTIFY_SIGNAL	6	/* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY	7
 #define TIF_SYSCALL_TRACE	8	/* syscall trace active */
 #define TIF_SYSCALL_AUDIT	9	/* syscall auditing */
 #define TIF_SYSCALL_TRACEPOINT	10	/* syscall tracepoint for ftrace */
@ arch/arm64/include/asm/thread_info.h:105 @ int arch_dup_task_struct(struct task_struct *dst,
 #define _TIF_SVE		(1 << TIF_SVE)
 #define _TIF_MTE_ASYNC_FAULT	(1 << TIF_MTE_ASYNC_FAULT)
 #define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+				 _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
 				 _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
 				 _TIF_NOTIFY_SIGNAL)
@ arch/arm64/include/asm/thread_info.h:117 @ int arch_dup_task_struct(struct task_struct *dst,
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
 				 _TIF_SYSCALL_EMU)
 
+#define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
 #ifdef CONFIG_SHADOW_CALL_STACK
 #define INIT_SCS							\
 	.scs_base	= init_shadow_call_stack,			\
@ arch/arm64/kernel/asm-offsets.c:36 @ int main(void)
   DEFINE(TSK_TI_CPU,		offsetof(struct task_struct, thread_info.cpu));
   DEFINE(TSK_TI_FLAGS,		offsetof(struct task_struct, thread_info.flags));
   DEFINE(TSK_TI_PREEMPT,	offsetof(struct task_struct, thread_info.preempt_count));
+  DEFINE(TSK_TI_PREEMPT_LAZY,	offsetof(struct task_struct, thread_info.preempt_lazy_count));
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
   DEFINE(TSK_TI_TTBR0,		offsetof(struct task_struct, thread_info.ttbr0));
 #endif
@ arch/arm64/kernel/signal.c:1281 @ static void do_signal(struct pt_regs *regs)
 void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
 {
 	do {
-		if (thread_flags & _TIF_NEED_RESCHED) {
+		if (thread_flags & _TIF_NEED_RESCHED_MASK) {
 			/* Unmask Debug and SError for the next task */
 			local_daif_restore(DAIF_PROCCTX_NOIRQ);
 
@ arch/powerpc/Kconfig:157 @ config PPC
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
+	select ARCH_SUPPORTS_RT			if HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_MEMTEST
@ arch/powerpc/Kconfig:251 @ config PPC
 	select HAVE_PERF_EVENTS_NMI		if PPC64
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_PREEMPT_LAZY
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
+	select HAVE_POSIX_CPU_TIMERS_TASK_WORK	if !KVM
 	select HAVE_RSEQ
 	select HAVE_SETUP_PER_CPU_AREA		if PPC64
 	select HAVE_SOFTIRQ_ON_OWN_STACK
@ arch/powerpc/include/asm/stackprotector.h:22 @
  */
 static __always_inline void boot_init_stack_canary(void)
 {
-	unsigned long canary = get_random_canary();
+	unsigned long canary;
 
+#ifndef CONFIG_PREEMPT_RT
+	canary = get_random_canary();
+#else
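+	/*
+	 * get_random_canary() may end up taking locks that are sleeping locks
+	 * on PREEMPT_RT, which is not safe in this context; fall back to a
+	 * weaker, address-based canary instead.
+	 */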
+	canary = ((unsigned long)&canary) & CANARY_MASK;
+#endif
 	current->stack_canary = canary;
 #ifdef CONFIG_PPC64
 	get_paca()->canary = canary;
@ arch/powerpc/include/asm/thread_info.h:56 @
 struct thread_info {
 	int		preempt_count;		/* 0 => preemptable,
 						   <0 => BUG */
+	int		preempt_lazy_count;	/* 0 => preemptable,
+						   <0 => BUG */
 #ifdef CONFIG_SMP
 	unsigned int	cpu;
 #endif
@ arch/powerpc/include/asm/thread_info.h:82 @ struct thread_info {
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
+	.preempt_lazy_count = 0,		\
 	.flags =	0,			\
 }
 
@ arch/powerpc/include/asm/thread_info.h:108 @ void arch_setup_new_exec(void);
 #define TIF_PATCH_PENDING	6	/* pending live patching update */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SINGLESTEP		8	/* singlestepping active */
+#define TIF_NEED_RESCHED_LAZY	9	/* lazy rescheduling necessary */
 #define TIF_SECCOMP		10	/* secure computing */
 #define TIF_RESTOREALL		11	/* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR		12	/* Force successful syscall return */
@ arch/powerpc/include/asm/thread_info.h:124 @ void arch_setup_new_exec(void);
 #define TIF_POLLING_NRFLAG	19	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_32BIT		20	/* 32 bit binary */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
@ arch/powerpc/include/asm/thread_info.h:136 @ void arch_setup_new_exec(void);
 #define _TIF_PATCH_PENDING	(1<<TIF_PATCH_PENDING)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED_LAZY	(1<<TIF_NEED_RESCHED_LAZY)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTOREALL		(1<<TIF_RESTOREALL)
 #define _TIF_NOERROR		(1<<TIF_NOERROR)
@ arch/powerpc/include/asm/thread_info.h:150 @ void arch_setup_new_exec(void);
 				 _TIF_SYSCALL_EMU)
 
 #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+				 _TIF_NEED_RESCHED_LAZY | \
 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
 				 _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
 				 _TIF_NOTIFY_SIGNAL)
 #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
 
 /* Bits in local_flags */
 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
@ arch/powerpc/kernel/interrupt.c:189 @ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
 	ti_flags = read_thread_flags();
 	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
 		local_irq_enable();
-		if (ti_flags & _TIF_NEED_RESCHED) {
+		if (ti_flags & _TIF_NEED_RESCHED_MASK) {
 			schedule();
 		} else {
 			/*
@ arch/powerpc/kernel/interrupt.c:401 @ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
 		/* Returning to a kernel context with local irqs enabled. */
 		WARN_ON_ONCE(!(regs->msr & MSR_EE));
 again:
-		if (IS_ENABLED(CONFIG_PREEMPT)) {
+		if (IS_ENABLED(CONFIG_PREEMPTION)) {
 			/* Return to preemptible kernel context */
 			if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) {
 				if (preempt_count() == 0)
 					preempt_schedule_irq();
+			} else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) {
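+				/*
+				 * A lazy reschedule request is only honoured
+				 * when neither the preempt count nor the lazy
+				 * count blocks preemption.
+				 */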
+				if ((preempt_count() == 0) &&
+				    (current_thread_info()->preempt_lazy_count == 0))
+					preempt_schedule_irq();
 			}
 		}
 
@ arch/powerpc/kernel/traps.c:264 @ static char *get_mmu_str(void)
 
 static int __die(const char *str, struct pt_regs *regs, long err)
 {
+	const char *pr = "";
+
 	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
 
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
 	printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
 	       IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
 	       PAGE_SIZE / 1024, get_mmu_str(),
-	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+	       pr,
 	       IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
 	       IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
 	       debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
@ arch/powerpc/kvm/Kconfig:228 @ config KVM_E500MC
 config KVM_MPIC
 	bool "KVM in-kernel MPIC emulation"
 	depends on KVM && PPC_E500
+	depends on !PREEMPT_RT
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
@ arch/powerpc/perf/imc-pmu.c:54 @ static int trace_imc_mem_size;
  * core and trace-imc
  */
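+/*
+ * __SPIN_LOCK_INITIALIZER() is an implementation-internal macro; use
+ * __SPIN_LOCK_UNLOCKED(), which is the supported initializer and also matches
+ * the PREEMPT_RT spinlock_t layout.
+ */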
 static struct imc_pmu_ref imc_global_refc = {
-	.lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock),
+	.lock = __SPIN_LOCK_UNLOCKED(imc_global_refc.lock),
 	.id = 0,
 	.refc = 0,
 };
@ arch/powerpc/platforms/pseries/Kconfig:5 @
 config PPC_PSERIES
 	depends on PPC64 && PPC_BOOK3S
 	bool "IBM pSeries & new (POWER5-based) iSeries"
+	select GENERIC_ALLOCATOR
 	select HAVE_PCSPKR_PLATFORM
 	select MPIC
 	select OF_DYNAMIC
@ arch/powerpc/platforms/pseries/iommu.c:27 @
 #include <linux/of.h>
 #include <linux/iommu.h>
 #include <linux/rculist.h>
+#include <linux/local_lock.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@ arch/powerpc/platforms/pseries/iommu.c:199 @ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
 	return ret;
 }
 
-static DEFINE_PER_CPU(__be64 *, tce_page);
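+/*
+ * Pair the per-CPU TCE page with a local_lock_t. On !PREEMPT_RT this behaves
+ * like the previous local_irq_save() protection; on PREEMPT_RT it becomes a
+ * per-CPU lock that keeps the section preemptible while still serializing
+ * access to the page.
+ */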
+struct tce_page {
+	__be64 * page;
+	local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct tce_page, tce_page) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
 
 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
@ arch/powerpc/platforms/pseries/iommu.c:228 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		                           direction, attrs);
 	}
 
-	local_irq_save(flags);	/* to protect tcep and the page behind it */
+	/* to protect tcep and the page behind it */
+	local_lock_irqsave(&tce_page.lock, flags);
 
-	tcep = __this_cpu_read(tce_page);
+	tcep = __this_cpu_read(tce_page.page);
 
 	/* This is safe to do since interrupts are off when we're called
 	 * from iommu_alloc{,_sg}()
@ arch/powerpc/platforms/pseries/iommu.c:240 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
 		/* If allocation fails, fall back to the loop implementation */
 		if (!tcep) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&tce_page.lock, flags);
 			return tce_build_pSeriesLP(tbl->it_index, tcenum,
 					tceshift,
 					npages, uaddr, direction, attrs);
 		}
-		__this_cpu_write(tce_page, tcep);
+		__this_cpu_write(tce_page.page, tcep);
 	}
 
 	rpn = __pa(uaddr) >> tceshift;
@ arch/powerpc/platforms/pseries/iommu.c:275 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&tce_page.lock, flags);
 
 	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
 		ret = (int)rc;
@ arch/powerpc/platforms/pseries/iommu.c:451 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 				DMA_BIDIRECTIONAL, 0);
 	}
 
-	local_irq_disable();	/* to protect tcep and the page behind it */
-	tcep = __this_cpu_read(tce_page);
+	/* to protect tcep and the page behind it */
+	local_lock_irq(&tce_page.lock);
+	tcep = __this_cpu_read(tce_page.page);
 
 	if (!tcep) {
 		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
 		if (!tcep) {
-			local_irq_enable();
+			local_unlock_irq(&tce_page.lock);
 			return -ENOMEM;
 		}
-		__this_cpu_write(tce_page, tcep);
+		__this_cpu_write(tce_page.page, tcep);
 	}
 
 	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
@ arch/powerpc/platforms/pseries/iommu.c:504 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 
 	/* error cleanup: caller will clear whole range */
 
-	local_irq_enable();
+	local_unlock_irq(&tce_page.lock);
 	return rc;
 }
 
@ arch/x86/Kconfig:117 @ config X86
 	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
+	select ARCH_SUPPORTS_RT
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
@ arch/x86/Kconfig:256 @ config X86
 	select HAVE_PCI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_PREEMPT_LAZY
 	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
 	select MMU_GATHER_MERGE_VMAS
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
@ arch/x86/include/asm/preempt.h:93 @ static __always_inline void __preempt_count_sub(int val)
  * a decrement which hits zero means we have no preempt_count and should
  * reschedule.
  */
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
 {
 	return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e,
 			       __percpu_arg([var]));
 }
 
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+	if (____preempt_count_dec_and_test())
+		return true;
+#ifdef CONFIG_PREEMPT_LAZY
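+	/*
+	 * Also honour a lazy reschedule request, but only when both the
+	 * regular preempt count and the lazy count are zero.
+	 */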
+	if (preempt_count())
+		return false;
+	if (current_thread_info()->preempt_lazy_count)
+		return false;
+	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+	return false;
+#endif
+}
+
 /*
  * Returns true when we need to resched and can (barring IRQ state).
  */
 static __always_inline bool should_resched(int preempt_offset)
 {
+#ifdef CONFIG_PREEMPT_LAZY
+	u32 tmp;
+	tmp = raw_cpu_read_4(pcpu_hot.preempt_count);
+	if (tmp == preempt_offset)
+		return true;
+
+	/* preempt count == 0 ? */
+	tmp &= ~PREEMPT_NEED_RESCHED;
+	if (tmp != preempt_offset)
+		return false;
+	/* XXX PREEMPT_LOCK_OFFSET */
+	if (current_thread_info()->preempt_lazy_count)
+		return false;
+	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
 	return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset);
+#endif
 }
 
 #ifdef CONFIG_PREEMPTION
@ arch/x86/include/asm/thread_info.h:60 @ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		syscall_work;	/* SYSCALL_WORK_ flags */
 	u32			status;		/* thread synchronous flags */
+	int			preempt_lazy_count;	/* 0 => lazy preemptable,
+							   <0 => BUG */
 #ifdef CONFIG_SMP
 	u32			cpu;		/* current CPU */
 #endif
@ arch/x86/include/asm/thread_info.h:70 @ struct thread_info {
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.flags		= 0,			\
+	.preempt_lazy_count	= 0,		\
 }
 
 #else /* !__ASSEMBLY__ */
@ arch/x86/include/asm/thread_info.h:98 @ struct thread_info {
 #define TIF_NOCPUID		15	/* CPUID is not accessible in userland */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_NOTIFY_SIGNAL	17	/* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY	19	/* lazy rescheduling necessary */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG	21	/* idle is polling for TIF_NEED_RESCHED */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
@ arch/x86/include/asm/thread_info.h:122 @ struct thread_info {
 #define _TIF_NOCPUID		(1 << TIF_NOCPUID)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_SPEC_FORCE_UPDATE	(1 << TIF_SPEC_FORCE_UPDATE)
@ drivers/block/zram/zram_drv.c:60 @ static void zram_free_page(struct zram *zram, size_t index);
 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 				u32 index, int offset, struct bio *bio);
 
+#ifdef CONFIG_PREEMPT_RT
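+/*
+ * On PREEMPT_RT the ZRAM_LOCK bit spinlock is replaced by a per-entry
+ * spinlock_t (a sleeping lock on RT). The ZRAM_LOCK flag is still mirrored so
+ * that existing flag checks keep working.
+ */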
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
+{
+	size_t index;
+
+	for (index = 0; index < num_pages; index++)
+		spin_lock_init(&zram->table[index].lock);
+}
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+	int ret;
+
+	ret = spin_trylock(&zram->table[index].lock);
+	if (ret)
+		__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+	return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+	spin_lock(&zram->table[index].lock);
+	__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+	__clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+	spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
 
 static int zram_slot_trylock(struct zram *zram, u32 index)
 {
@ drivers/block/zram/zram_drv.c:109 @ static void zram_slot_unlock(struct zram *zram, u32 index)
 {
 	bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
 }
+#endif
 
 static inline bool init_done(struct zram *zram)
 {
@ drivers/block/zram/zram_drv.c:1349 @ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 
 	if (!huge_class_size)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);
+	zram_meta_init_table_locks(zram, num_pages);
 	return true;
 }
 
@ drivers/block/zram/zram_drv.h:72 @ struct zram_table_entry {
 		unsigned long element;
 	};
 	unsigned long flags;
+#ifdef CONFIG_PREEMPT_RT
+	spinlock_t lock;
+#endif
 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
 	ktime_t ac_time;
 #endif
@ drivers/char/tpm/tpm_tis.c:53 @ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
 	return container_of(data, struct tpm_tis_tcg_phy, priv);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flush previous write operations with a dummy read operation to the
+ * TPM MMIO base address.
+ */
+static inline void tpm_tis_flush(void __iomem *iobase)
+{
+	ioread8(iobase + TPM_ACCESS(0));
+}
+#else
+#define tpm_tis_flush(iobase) do { } while (0)
+#endif
+
+/*
+ * Write a single byte to the TPM MMIO address, and flush the write queue.
+ * The flush ensures that the data is sent immediately over the bus and not
+ * aggregated with further requests and transferred later in a batch. The large
+ * write requests can lead to unwanted latency spikes by blocking the CPU until
+ * the complete batch has been transferred.
+ */
+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
+{
+	iowrite8(b, iobase + addr);
+	tpm_tis_flush(iobase);
+}
+
+/*
+ * Write a 32-bit word to the TPM MMIO address, and flush the write queue.
+ * The flush ensures that the data is sent immediately over the bus and not
+ * aggregated with further requests and transferred later in a batch. The large
+ * write requests can lead to unwanted latency spikes by blocking the CPU until
+ * the complete batch has been transferred.
+ */
+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
+{
+	iowrite32(b, iobase + addr);
+	tpm_tis_flush(iobase);
+}
+
 static int interrupts = -1;
 module_param(interrupts, int, 0444);
 MODULE_PARM_DESC(interrupts, "Enable interrupts");
@ drivers/char/tpm/tpm_tis.c:228 @ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
 	switch (io_mode) {
 	case TPM_TIS_PHYS_8:
 		while (len--)
-			iowrite8(*value++, phy->iobase + addr);
+			tpm_tis_iowrite8(*value++, phy->iobase, addr);
 		break;
 	case TPM_TIS_PHYS_16:
 		return -EINVAL;
 	case TPM_TIS_PHYS_32:
-		iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase + addr);
+		tpm_tis_iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase, addr);
 		break;
 	}
 
@ drivers/gpu/drm/i915/Kconfig:6 @ config DRM_I915
 	tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics"
 	depends on DRM
 	depends on X86 && PCI
-	depends on !PREEMPT_RT
 	select INTEL_GTT if X86
 	select INTERVAL_TREE
 	# we need shmfs for the swappable backing store, and in particular
@ drivers/gpu/drm/i915/display/intel_crtc.c:523 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
 	 */
 	intel_psr_wait_for_idle_locked(new_crtc_state);
 
-	local_irq_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 
 	crtc->debug.min_vbl = min;
 	crtc->debug.max_vbl = max;
@ drivers/gpu/drm/i915/display/intel_crtc.c:549 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
 			break;
 		}
 
-		local_irq_enable();
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_enable();
 
 		timeout = schedule_timeout(timeout);
 
-		local_irq_disable();
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_disable();
 	}
 
 	finish_wait(wq, &wait);
@ drivers/gpu/drm/i915/display/intel_crtc.c:588 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
 	return;
 
 irq_disable:
-	local_irq_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE)
@ drivers/gpu/drm/i915/display/intel_crtc.c:698 @ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state)
 	if (new_crtc_state->seamless_m_n && intel_crtc_needs_fastset(new_crtc_state))
 		intel_crtc_update_active_timings(new_crtc_state);
 
-	local_irq_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_enable();
 
 	if (intel_vgpu_active(dev_priv))
 		return;
@ drivers/gpu/drm/i915/display/intel_vblank.c:296 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
 	 */
 	spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
 
-	/* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 
 	/* Get optional system timestamp before query. */
 	if (stime)
@ drivers/gpu/drm/i915/display/intel_vblank.c:362 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
 	if (etime)
 		*etime = ktime_get();
 
-	/* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 
 	spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
 
@ drivers/gpu/drm/i915/gt/intel_breadcrumbs.c:315 @ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
 	/* Kick the work once more to drain the signalers, and disarm the irq */
 	irq_work_sync(&b->irq_work);
 	while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
-		local_irq_disable();
-		signal_irq_work(&b->irq_work);
-		local_irq_enable();
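+		/*
+		 * Queue the irq_work and wait for it instead of invoking the
+		 * callback directly with interrupts disabled; on PREEMPT_RT
+		 * irq_work callbacks may run from task context.
+		 */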
+		irq_work_queue(&b->irq_work);
 		cond_resched();
+		irq_work_sync(&b->irq_work);
 	}
 }
 
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1306 @ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
-	spin_lock(&sched_engine->lock);
+	spin_lock_irq(&sched_engine->lock);
 
 	/*
 	 * If the queue is higher priority than the last
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1406 @ static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * Even if ELSP[1] is occupied and not worthy
 				 * of timeslices, our queue might be.
 				 */
-				spin_unlock(&sched_engine->lock);
+				spin_unlock_irq(&sched_engine->lock);
 				return;
 			}
 		}
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1432 @ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 		if (last && !can_merge_rq(last, rq)) {
 			spin_unlock(&ve->base.sched_engine->lock);
-			spin_unlock(&engine->sched_engine->lock);
+			spin_unlock_irq(&engine->sched_engine->lock);
 			return; /* leave this for another sibling */
 		}
 
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1594 @ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 	sched_engine->queue_priority_hint = queue_prio(sched_engine);
 	i915_sched_engine_reset_on_empty(sched_engine);
-	spin_unlock(&sched_engine->lock);
+	spin_unlock_irq(&sched_engine->lock);
 
 	/*
 	 * We can skip poking the HW if we ended up with exactly the same set
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1620 @ static void execlists_dequeue(struct intel_engine_cs *engine)
 	}
 }
 
-static void execlists_dequeue_irq(struct intel_engine_cs *engine)
-{
-	local_irq_disable(); /* Suspend interrupts across request submission */
-	execlists_dequeue(engine);
-	local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */
-}
-
 static void clear_ports(struct i915_request **ports, int count)
 {
 	memset_p((void **)ports, NULL, count);
@ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2473 @ static void execlists_submission_tasklet(struct tasklet_struct *t)
 	}
 
 	if (!engine->execlists.pending[0]) {
-		execlists_dequeue_irq(engine);
+		execlists_dequeue(engine);
 		start_timeslice(engine);
 	}
 
@ drivers/gpu/drm/i915/i915_request.c:616 @ bool __i915_request_submit(struct i915_request *request)
 
 	RQ_TRACE(request, "\n");
 
-	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->sched_engine->lock);
 
 	/*
@ drivers/gpu/drm/i915/i915_request.c:724 @ void __i915_request_unsubmit(struct i915_request *request)
 	 */
 	RQ_TRACE(request, "\n");
 
-	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->sched_engine->lock);
 
 	/*
@ drivers/gpu/drm/i915/i915_trace.h:9 @
 #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
 #define _I915_TRACE_H_
 
+#ifdef CONFIG_PREEMPT_RT
+#define NOTRACE
+#endif
+
 #include <linux/stringify.h>
 #include <linux/types.h>
 #include <linux/tracepoint.h>
@ drivers/gpu/drm/i915/i915_trace.h:329 @ DEFINE_EVENT(i915_request, i915_request_add,
 	     TP_ARGS(rq)
 );
 
-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS)
+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE)
 DEFINE_EVENT(i915_request, i915_request_guc_submit,
 	     TP_PROTO(struct i915_request *rq),
 	     TP_ARGS(rq)
@ drivers/gpu/drm/i915/i915_utils.h:291 @ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
 #define wait_for(COND, MS)		_wait_for((COND), (MS) * 1000, 10, 1000)
 
 /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */
-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT)
+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT)
 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic())
 #else
 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0)
@ drivers/tty/serial/8250/8250.h:180 @ static inline void serial_dl_write(struct uart_8250_port *up, int value)
 	up->dl_write(up, value);
 }
 
+static inline bool serial8250_is_console(struct uart_port *port)
+{
+	return uart_console(port) && !hlist_unhashed_lockless(&port->cons->node);
+}
+
+/**
+ * serial8250_init_wctxt - Initialize a write context for
+ *	non-console-printing usage
+ * @wctxt:	The write context to initialize
+ * @cons:	The console to assign to the write context
+ *
+ * In order to mark an unsafe region, drivers must acquire the console. This
+ * requires providing an initialized write context (even if that driver will
+ * not be doing any printing).
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_init_wctxt(struct cons_write_context *wctxt,
+					 struct console *cons)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	memset(wctxt, 0, sizeof(*wctxt));
+	ctxt->console = cons;
+	ctxt->prio = CONS_PRIO_NORMAL;
+}
+
+/**
+ * __serial8250_console_acquire - Acquire a console for
+ *	non-console-printing usage
+ * @wctxt:	An uninitialized write context to use for acquiring
+ * @cons:	The console to assign to the write context
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void __serial8250_console_acquire(struct cons_write_context *wctxt,
+						struct console *cons)
+{
+	for (;;) {
+		serial8250_init_wctxt(wctxt, cons);
+		if (console_try_acquire(wctxt))
+			break;
+		cpu_relax();
+	}
+}
+
+/**
+ * serial8250_enter_unsafe - Mark the beginning of an unsafe region for
+ *		non-console-printing usage
+ * @up:	The port that is entering the unsafe state
+ *
+ * The caller should ensure @up is a console before calling this function.
+ *
+ * The caller is holding the port->lock.
+ * This function takes the console_srcu_read_lock and becomes owner of the
+ * console associated with @up.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_enter_unsafe(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+
+	lockdep_assert_held_once(&port->lock);
+
+	for (;;) {
+		up->cookie = console_srcu_read_lock();
+
+		__serial8250_console_acquire(&up->wctxt, port->cons);
+
+		if (console_enter_unsafe(&up->wctxt))
+			break;
+
+		console_srcu_read_unlock(up->cookie);
+		cpu_relax();
+	}
+}
+
+/**
+ * serial8250_exit_unsafe - Mark the end of an unsafe region for
+ *		non-console-printing usage
+ * @up:	The port that is exiting the unsafe state
+ *
+ * The caller is holding the port->lock.
+ * This function releases ownership of the console associated with @up and
+ * releases the console_srcu_read_lock.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_exit_unsafe(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+
+	lockdep_assert_held_once(&port->lock);
+
+	if (console_exit_unsafe(&up->wctxt))
+		console_release(&up->wctxt);
+
+	console_srcu_read_unlock(up->cookie);
+}
+
+/**
+ * serial8250_in_IER - Read the IER register for
+ *		non-console-printing usage
+ * @up:	The port to work on
+ *
+ * Returns:	The value read from IER
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * read the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline int serial8250_in_IER(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+	bool is_console;
+	int ier;
+
+	is_console = serial8250_is_console(port);
+
+	if (is_console)
+		serial8250_enter_unsafe(up);
+
+	ier = serial_in(up, UART_IER);
+
+	if (is_console)
+		serial8250_exit_unsafe(up);
+
+	return ier;
+}
+
+/**
+ * __serial8250_set_IER - Directly write to the IER register
+ * @up:		The port to work on
+ * @wctxt:	The current write context
+ * @ier:	The value to write
+ *
+ * Returns:	True if IER was written to. False otherwise
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ * The caller is the owner of the console associated with @up.
+ *
+ * This function should only be directly called within console printing
+ * contexts. Other contexts should use serial8250_set_IER().
+ */
+static inline bool __serial8250_set_IER(struct uart_8250_port *up,
+					struct cons_write_context *wctxt,
+					int ier)
+{
+	if (wctxt && !console_can_proceed(wctxt))
+		return false;
+	serial_out(up, UART_IER, ier);
+	return true;
+}
+
+/**
+ * serial8250_set_IER - Write a new value to the IER register for
+ *	non-console-printing usage
+ * @up:		The port to work on
+ * @ier:	The value to write
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * write to the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline void serial8250_set_IER(struct uart_8250_port *up, int ier)
+{
+	struct uart_port *port = &up->port;
+	bool is_console;
+
+	is_console = serial8250_is_console(port);
+
+	if (is_console) {
+		serial8250_enter_unsafe(up);
+		while (!__serial8250_set_IER(up, &up->wctxt, ier)) {
+			console_srcu_read_unlock(up->cookie);
+			console_enter_unsafe(&up->wctxt);
+		}
+		serial8250_exit_unsafe(up);
+	} else {
+		__serial8250_set_IER(up, NULL, ier);
+	}
+}
+
+/**
+ * __serial8250_clear_IER - Directly clear the IER register
+ * @up:		The port to work on
+ * @wctxt:	The current write context
+ * @prior:	Gets set to the previous value of IER
+ *
+ * Returns:	True if IER was cleared and @prior points to the previous
+ *		value of IER. False otherwise and @prior is invalid
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ * The caller is the owner of the console associated with @up.
+ *
+ * This function should only be directly called within console printing
+ * contexts. Other contexts should use serial8250_clear_IER().
+ */
+static inline bool __serial8250_clear_IER(struct uart_8250_port *up,
+					  struct cons_write_context *wctxt,
+					  int *prior)
+{
+	unsigned int clearval = 0;
+
+	if (up->capabilities & UART_CAP_UUE)
+		clearval = UART_IER_UUE;
+
+	*prior = serial_in(up, UART_IER);
+	if (wctxt && !console_can_proceed(wctxt))
+		return false;
+	serial_out(up, UART_IER, clearval);
+	return true;
+}
+
+/**
+ * serial8250_clear_IER - Clear the IER register for
+ *		non-console-printing usage
+ * @up:	The port to work on
+ *
+ * Returns:	The previous value of IER
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * clear the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline int serial8250_clear_IER(struct uart_8250_port *up)
+{
+	struct uart_port *port = &up->port;
+	bool is_console;
+	int prior;
+
+	is_console = serial8250_is_console(port);
+
+	if (is_console) {
+		serial8250_enter_unsafe(up);
+		while (!__serial8250_clear_IER(up, &up->wctxt, &prior)) {
+			console_srcu_read_unlock(up->cookie);
+			console_enter_unsafe(&up->wctxt);
+		}
+		serial8250_exit_unsafe(up);
+	} else {
+		__serial8250_clear_IER(up, NULL, &prior);
+	}
+
+	return prior;
+}
+
 static inline bool serial8250_set_THRI(struct uart_8250_port *up)
 {
 	if (up->ier & UART_IER_THRI)
 		return false;
 	up->ier |= UART_IER_THRI;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 	return true;
 }
 
@ drivers/tty/serial/8250/8250.h:459 @ static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
 	if (!(up->ier & UART_IER_THRI))
 		return false;
 	up->ier &= ~UART_IER_THRI;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 	return true;
 }
 
@ drivers/tty/serial/8250/8250_aspeed_vuart.c:281 @ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up,
 	up->ier &= ~irqs;
 	if (!throttle)
 		up->ier |= irqs;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 }
 static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle)
 {
@ drivers/tty/serial/8250/8250_bcm7271.c:609 @ static int brcmuart_startup(struct uart_port *port)
 	 * Disable the Receive Data Interrupt because the DMA engine
 	 * will handle this.
 	 */
+	spin_lock_irq(&port->lock);
 	up->ier &= ~UART_IER_RDI;
-	serial_port_out(port, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
+	spin_unlock_irq(&port->lock);
 
 	priv->tx_running = false;
 	priv->dma.rx_dma = NULL;
@ drivers/tty/serial/8250/8250_bcm7271.c:792 @ static int brcmuart_handle_irq(struct uart_port *p)
 		spin_lock_irqsave(&p->lock, flags);
 		status = serial_port_in(p, UART_LSR);
 		if ((status & UART_LSR_DR) == 0) {
+			bool is_console;
+
+			is_console = serial8250_is_console(p);
+
+			if (is_console)
+				serial8250_enter_unsafe(up);
 
 			ier = serial_port_in(p, UART_IER);
 			/*
@ drivers/tty/serial/8250/8250_bcm7271.c:818 @ static int brcmuart_handle_irq(struct uart_port *p)
 				serial_port_in(p, UART_RX);
 			}
 
+			if (is_console)
+				serial8250_exit_unsafe(up);
+
 			handled = 1;
 		}
 		spin_unlock_irqrestore(&p->lock, flags);
@ drivers/tty/serial/8250/8250_bcm7271.c:858 @ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t)
 	/* re-enable receive unless upper layer has disabled it */
 	if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) ==
 	    (UART_IER_RLSI | UART_IER_RDI)) {
+		bool is_console;
+
+		is_console = serial8250_is_console(p);
+
+		if (is_console)
+			serial8250_enter_unsafe(up);
+
 		status = serial_port_in(p, UART_IER);
 		status |= (UART_IER_RLSI | UART_IER_RDI);
 		serial_port_out(p, UART_IER, status);
 		status = serial_port_in(p, UART_MCR);
 		status |= UART_MCR_RTS;
 		serial_port_out(p, UART_MCR, status);
+
+		if (is_console)
+			serial8250_exit_unsafe(up);
 	}
 	spin_unlock_irqrestore(&p->lock, flags);
 	return HRTIMER_NORESTART;
@ drivers/tty/serial/8250/8250_core.c:259 @ static void serial8250_timeout(struct timer_list *t)
 static void serial8250_backup_timeout(struct timer_list *t)
 {
 	struct uart_8250_port *up = from_timer(up, t, timer);
+	struct uart_port *port = &up->port;
 	unsigned int iir, ier = 0, lsr;
 	unsigned long flags;
 
@ drivers/tty/serial/8250/8250_core.c:270 @ static void serial8250_backup_timeout(struct timer_list *t)
 	 * based handler.
 	 */
 	if (up->port.irq) {
+		bool is_console;
+
+		/*
+		 * Do not use serial8250_clear_IER() because this code
+		 * ignores capabilities.
+		 */
+
+		is_console = serial8250_is_console(port);
+
+		if (is_console)
+			serial8250_enter_unsafe(up);
+
 		ier = serial_in(up, UART_IER);
 		serial_out(up, UART_IER, 0);
+
+		if (is_console)
+			serial8250_exit_unsafe(up);
 	}
 
 	iir = serial_in(up, UART_IIR);
@ drivers/tty/serial/8250/8250_core.c:309 @ static void serial8250_backup_timeout(struct timer_list *t)
 		serial8250_tx_chars(up);
 
 	if (up->port.irq)
-		serial_out(up, UART_IER, ier);
+		serial8250_set_IER(up, ier);
 
 	spin_unlock_irqrestore(&up->port.lock, flags);
 
@ drivers/tty/serial/8250/8250_core.c:595 @ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
 
 #ifdef CONFIG_SERIAL_8250_CONSOLE
 
-static void univ8250_console_write(struct console *co, const char *s,
-				   unsigned int count)
+static void univ8250_console_port_lock(struct console *con, bool do_lock, unsigned long *flags)
+{
+	struct uart_8250_port *up = &serial8250_ports[con->index];
+
+	if (do_lock)
+		spin_lock_irqsave(&up->port.lock, *flags);
+	else
+		spin_unlock_irqrestore(&up->port.lock, *flags);
+}
+
+static bool univ8250_console_write_atomic(struct console *co,
+					  struct cons_write_context *wctxt)
 {
 	struct uart_8250_port *up = &serial8250_ports[co->index];
 
-	serial8250_console_write(up, s, count);
+	return serial8250_console_write_atomic(up, wctxt);
+}
+
+static bool univ8250_console_write_thread(struct console *co,
+					  struct cons_write_context *wctxt)
+{
+	struct uart_8250_port *up = &serial8250_ports[co->index];
+
+	return serial8250_console_write_thread(up, wctxt);
 }
 
 static int univ8250_console_setup(struct console *co, char *options)
@ drivers/tty/serial/8250/8250_core.c:706 @ static int univ8250_console_match(struct console *co, char *name, int idx,
 
 static struct console univ8250_console = {
 	.name		= "ttyS",
-	.write		= univ8250_console_write,
+	.write_atomic	= univ8250_console_write_atomic,
+	.write_thread	= univ8250_console_write_thread,
+	.port_lock	= univ8250_console_port_lock,
 	.device		= uart_console_device,
 	.setup		= univ8250_console_setup,
 	.exit		= univ8250_console_exit,
 	.match		= univ8250_console_match,
-	.flags		= CON_PRINTBUFFER | CON_ANYTIME,
+	.flags		= CON_PRINTBUFFER | CON_ANYTIME | CON_NO_BKL,
 	.index		= -1,
 	.data		= &serial8250_reg,
 };
@ drivers/tty/serial/8250/8250_core.c:1001 @ static void serial_8250_overrun_backoff_work(struct work_struct *work)
 	spin_lock_irqsave(&port->lock, flags);
 	up->ier |= UART_IER_RLSI | UART_IER_RDI;
 	up->port.read_status_mask |= UART_LSR_DR;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 
@ drivers/tty/serial/8250/8250_exar.c:188 @ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud,
 
 static int xr17v35x_startup(struct uart_port *port)
 {
+	struct uart_8250_port *up = up_to_u8250p(port);
+
+	spin_lock_irq(&port->lock);
+
 	/*
 	 * First enable access to IER [7:5], ISR [5:4], FCR [5:4],
 	 * MCR [7:5] and MSR [7:0]
@ drivers/tty/serial/8250/8250_exar.c:202 @ static int xr17v35x_startup(struct uart_port *port)
 	 * Make sure all interrups are masked until initialization is
 	 * complete and the FIFOs are cleared
 	 */
-	serial_port_out(port, UART_IER, 0);
+	serial8250_set_IER(up, 0);
+
+	spin_unlock_irq(&port->lock);
 
 	return serial8250_do_startup(port);
 }
@ drivers/tty/serial/8250/8250_fsl.c:61 @ int fsl8250_handle_irq(struct uart_port *port)
 	if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
 		unsigned long delay;
 
-		up->ier = port->serial_in(port, UART_IER);
+		up->ier = serial8250_in_IER(up);
+
 		if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
 			port->ops->stop_rx(port);
 		} else {
@ drivers/tty/serial/8250/8250_mtk.c:225 @ static void mtk8250_shutdown(struct uart_port *port)
 
 static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
 {
-	serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+	struct uart_port *port = &up->port;
+	bool is_console;
+	int ier;
+
+	is_console = serial8250_is_console(port);
+
+	if (is_console)
+		serial8250_enter_unsafe(up);
+
+	ier = serial_in(up, UART_IER);
+	serial_out(up, UART_IER, ier & (~mask));
+
+	if (is_console)
+		serial8250_exit_unsafe(up);
 }
 
 static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
 {
-	serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+	struct uart_port *port = &up->port;
+	bool is_console;
+	int ier;
+
+	is_console = serial8250_is_console(port);
+
+	if (is_console)
+		serial8250_enter_unsafe(up);
+
+	ier = serial_in(up, UART_IER);
+	serial_out(up, UART_IER, ier | mask);
+
+	if (is_console)
+		serial8250_exit_unsafe(up);
 }
 
 static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)
@ drivers/tty/serial/8250/8250_omap.c:337 @ static void omap8250_restore_regs(struct uart_8250_port *up)
 
 	/* drop TCR + TLR access, we setup XON/XOFF later */
 	serial8250_out_MCR(up, mcr);
-
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 
 	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
 	serial_dl_write(up, priv->quot);
@ drivers/tty/serial/8250/8250_omap.c:525 @ static void omap_8250_pm(struct uart_port *port, unsigned int state,
 	u8 efr;
 
 	pm_runtime_get_sync(port->dev);
+
+	spin_lock_irq(&port->lock);
+
 	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
 	efr = serial_in(up, UART_EFR);
 	serial_out(up, UART_EFR, efr | UART_EFR_ECB);
 	serial_out(up, UART_LCR, 0);
 
-	serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0);
+	serial8250_set_IER(up, (state != 0) ? UART_IERX_SLEEP : 0);
 	serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
 	serial_out(up, UART_EFR, efr);
 	serial_out(up, UART_LCR, 0);
 
+	spin_unlock_irq(&port->lock);
+
 	pm_runtime_mark_last_busy(port->dev);
 	pm_runtime_put_autosuspend(port->dev);
 }
@ drivers/tty/serial/8250/8250_omap.c:656 @ static irqreturn_t omap8250_irq(int irq, void *dev_id)
 	if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) {
 		unsigned long delay;
 
-		up->ier = port->serial_in(port, UART_IER);
+		spin_lock(&port->lock);
+		up->ier = serial8250_in_IER(up);
 		if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
 			port->ops->stop_rx(port);
 		} else {
@ drivers/tty/serial/8250/8250_omap.c:666 @ static irqreturn_t omap8250_irq(int irq, void *dev_id)
 			 */
 			cancel_delayed_work(&up->overrun_backoff);
 		}
+		spin_unlock(&port->lock);
 
 		delay = msecs_to_jiffies(up->overrun_backoff_time_ms);
 		schedule_delayed_work(&up->overrun_backoff, delay);
@ drivers/tty/serial/8250/8250_omap.c:716 @ static int omap_8250_startup(struct uart_port *port)
 	if (ret < 0)
 		goto err;
 
+	spin_lock_irq(&port->lock);
 	up->ier = UART_IER_RLSI | UART_IER_RDI;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
+	spin_unlock_irq(&port->lock);
 
 #ifdef CONFIG_PM
 	up->capabilities |= UART_CAP_RPM;
@ drivers/tty/serial/8250/8250_omap.c:759 @ static void omap_8250_shutdown(struct uart_port *port)
 	if (priv->habit & UART_HAS_EFR2)
 		serial_out(up, UART_OMAP_EFR2, 0x0);
 
+	spin_lock_irq(&port->lock);
 	up->ier = 0;
-	serial_out(up, UART_IER, 0);
+	serial8250_set_IER(up, 0);
+	spin_unlock_irq(&port->lock);
 
 	if (up->dma)
 		serial8250_release_dma(up);
@ drivers/tty/serial/8250/8250_omap.c:810 @ static void omap_8250_unthrottle(struct uart_port *port)
 		up->dma->rx_dma(up);
 	up->ier |= UART_IER_RLSI | UART_IER_RDI;
 	port->read_status_mask |= UART_LSR_DR;
-	serial_out(up, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	pm_runtime_mark_last_busy(port->dev);
@ drivers/tty/serial/8250/8250_omap.c:969 @ static void __dma_rx_complete(void *param)
 	__dma_rx_do_complete(p);
 	if (!priv->throttled) {
 		p->ier |= UART_IER_RLSI | UART_IER_RDI;
-		serial_out(p, UART_IER, p->ier);
+		serial8250_set_IER(p, p->ier);
 		if (!(priv->habit & UART_HAS_EFR2))
 			omap_8250_rx_dma(p);
 	}
@ drivers/tty/serial/8250/8250_omap.c:1026 @ static int omap_8250_rx_dma(struct uart_8250_port *p)
 			 * callback to run.
 			 */
 			p->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
-			serial_out(p, UART_IER, p->ier);
+			serial8250_set_IER(p, p->ier);
 		}
 		goto out;
 	}
@ drivers/tty/serial/8250/8250_omap.c:1239 @ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir,
 		 * periodic timeouts, re-enable interrupts.
 		 */
 		up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
-		serial_out(up, UART_IER, up->ier);
+		serial8250_set_IER(up, up->ier);
 		omap_8250_rx_dma_flush(up);
 		serial_in(up, UART_IIR);
 		serial_out(up, UART_OMAP_EFR2, 0x0);
 		up->ier |= UART_IER_RLSI | UART_IER_RDI;
-		serial_out(up, UART_IER, up->ier);
+		serial8250_set_IER(up, up->ier);
 	}
 }
 
@ drivers/tty/serial/8250/8250_omap.c:1730 @ static int omap8250_runtime_resume(struct device *dev)
 
 	up = serial8250_get_port(priv->line);
 
+	spin_lock_irq(&up->port.lock);
+
 	if (omap8250_lost_context(up))
 		omap8250_restore_regs(up);
 
 	if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2))
 		omap_8250_rx_dma(up);
 
+	spin_unlock_irq(&up->port.lock);
+
 	priv->latency = priv->calc_latency;
 	schedule_work(&priv->qos_work);
 	return 0;
@ drivers/tty/serial/8250/8250_port.c:748 @ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
 	serial8250_rpm_get(p);
 
 	if (p->capabilities & UART_CAP_SLEEP) {
+		spin_lock_irq(&p->port.lock);
 		if (p->capabilities & UART_CAP_EFR) {
 			lcr = serial_in(p, UART_LCR);
 			efr = serial_in(p, UART_EFR);
@ drivers/tty/serial/8250/8250_port.c:756 @ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
 			serial_out(p, UART_EFR, UART_EFR_ECB);
 			serial_out(p, UART_LCR, 0);
 		}
-		serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+		serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
 		if (p->capabilities & UART_CAP_EFR) {
 			serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
 			serial_out(p, UART_EFR, efr);
 			serial_out(p, UART_LCR, lcr);
 		}
+		spin_unlock_irq(&p->port.lock);
 	}
 
 	serial8250_rpm_put(p);
 }
 
-static void serial8250_clear_IER(struct uart_8250_port *up)
-{
-	if (up->capabilities & UART_CAP_UUE)
-		serial_out(up, UART_IER, UART_IER_UUE);
-	else
-		serial_out(up, UART_IER, 0);
-}
-
 #ifdef CONFIG_SERIAL_8250_RSA
 /*
  * Attempts to turn on the RSA FIFO.  Returns zero on failure.
@ drivers/tty/serial/8250/8250_port.c:1031 @ static int broken_efr(struct uart_8250_port *up)
  */
 static void autoconfig_16550a(struct uart_8250_port *up)
 {
+	struct uart_port *port = &up->port;
 	unsigned char status1, status2;
 	unsigned int iersave;
+	bool is_console;
 
 	up->port.type = PORT_16550A;
 	up->capabilities |= UART_CAP_FIFO;
@ drivers/tty/serial/8250/8250_port.c:1150 @ static void autoconfig_16550a(struct uart_8250_port *up)
 		return;
 	}
 
+	is_console = serial8250_is_console(port);
+
+	if (is_console)
+		serial8250_enter_unsafe(up);
+
 	/*
 	 * Try writing and reading the UART_IER_UUE bit (b6).
 	 * If it works, this is probably one of the Xscale platform's
@ drivers/tty/serial/8250/8250_port.c:1190 @ static void autoconfig_16550a(struct uart_8250_port *up)
 	}
 	serial_out(up, UART_IER, iersave);
 
+	if (is_console)
+		serial8250_exit_unsafe(up);
+
 	/*
 	 * We distinguish between 16550A and U6 16550A by counting
 	 * how many bytes are in the FIFO.
@ drivers/tty/serial/8250/8250_port.c:1234 @ static void autoconfig(struct uart_8250_port *up)
 	up->bugs = 0;
 
 	if (!(port->flags & UPF_BUGGY_UART)) {
+		bool is_console;
+
+		is_console = serial8250_is_console(port);
+
+		if (is_console)
+			serial8250_enter_unsafe(up);
+
 		/*
 		 * Do a simple existence test first; if we fail this,
 		 * there's no point trying anything else.
@ drivers/tty/serial/8250/8250_port.c:1270 @ static void autoconfig(struct uart_8250_port *up)
 #endif
 		scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
 		serial_out(up, UART_IER, scratch);
+
+		if (is_console)
+			serial8250_exit_unsafe(up);
+
 		if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) {
 			/*
 			 * We failed; there's nothing here
@ drivers/tty/serial/8250/8250_port.c:1395 @ static void autoconfig_irq(struct uart_8250_port *up)
 	unsigned char save_ICP = 0;
 	unsigned int ICP = 0;
 	unsigned long irqs;
+	bool is_console;
 	int irq;
 
 	if (port->flags & UPF_FOURPORT) {
@ drivers/tty/serial/8250/8250_port.c:1405 @ static void autoconfig_irq(struct uart_8250_port *up)
 		inb_p(ICP);
 	}
 
-	if (uart_console(port))
+	is_console = serial8250_is_console(port);
+
+	if (is_console) {
 		console_lock();
+		serial8250_enter_unsafe(up);
+	}
 
 	/* forget possible initially masked and pending IRQ */
 	probe_irq_off(probe_irq_on());
@ drivers/tty/serial/8250/8250_port.c:1442 @ static void autoconfig_irq(struct uart_8250_port *up)
 	if (port->flags & UPF_FOURPORT)
 		outb_p(save_ICP, ICP);
 
-	if (uart_console(port))
+	if (is_console) {
+		serial8250_exit_unsafe(up);
 		console_unlock();
+	}
 
 	port->irq = (irq > 0) ? irq : 0;
 }
@ drivers/tty/serial/8250/8250_port.c:1458 @ static void serial8250_stop_rx(struct uart_port *port)
 
 	up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
 	up->port.read_status_mask &= ~UART_LSR_DR;
-	serial_port_out(port, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 
 	serial8250_rpm_put(up);
 }
@ drivers/tty/serial/8250/8250_port.c:1488 @ void serial8250_em485_stop_tx(struct uart_8250_port *p)
 		serial8250_clear_and_reinit_fifos(p);
 
 		p->ier |= UART_IER_RLSI | UART_IER_RDI;
-		serial_port_out(&p->port, UART_IER, p->ier);
+		serial8250_set_IER(p, p->ier);
 	}
 }
 EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);
@ drivers/tty/serial/8250/8250_port.c:1735 @ static void serial8250_disable_ms(struct uart_port *port)
 	mctrl_gpio_disable_ms(up->gpios);
 
 	up->ier &= ~UART_IER_MSI;
-	serial_port_out(port, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 }
 
 static void serial8250_enable_ms(struct uart_port *port)
@ drivers/tty/serial/8250/8250_port.c:1751 @ static void serial8250_enable_ms(struct uart_port *port)
 	up->ier |= UART_IER_MSI;
 
 	serial8250_rpm_get(up);
-	serial_port_out(port, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 	serial8250_rpm_put(up);
 }
 
@ drivers/tty/serial/8250/8250_port.c:2201 @ static void serial8250_put_poll_char(struct uart_port *port,
 	serial8250_rpm_get(up);
 	/*
 	 *	First save the IER then disable the interrupts
+	 *
+	 *	Best-effort IER access because other CPUs are quiesced.
 	 */
-	ier = serial_port_in(port, UART_IER);
-	serial8250_clear_IER(up);
+	__serial8250_clear_IER(up, NULL, &ier);
 
 	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 	/*
@ drivers/tty/serial/8250/8250_port.c:2217 @ static void serial8250_put_poll_char(struct uart_port *port,
 	 *	and restore the IER
 	 */
 	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
-	serial_port_out(port, UART_IER, ier);
+	__serial8250_set_IER(up, NULL, ier);
 	serial8250_rpm_put(up);
 }
 
@ drivers/tty/serial/8250/8250_port.c:2228 @ int serial8250_do_startup(struct uart_port *port)
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned long flags;
 	unsigned char iir;
+	bool is_console;
 	int retval;
 	u16 lsr;
 
@ drivers/tty/serial/8250/8250_port.c:2246 @ int serial8250_do_startup(struct uart_port *port)
 	serial8250_rpm_get(up);
 	if (port->type == PORT_16C950) {
 		/* Wake up and initialize UART */
+		spin_lock_irqsave(&port->lock, flags);
 		up->acr = 0;
 		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
 		serial_port_out(port, UART_EFR, UART_EFR_ECB);
-		serial_port_out(port, UART_IER, 0);
+		serial8250_set_IER(up, 0);
 		serial_port_out(port, UART_LCR, 0);
 		serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
 		serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
 		serial_port_out(port, UART_EFR, UART_EFR_ECB);
 		serial_port_out(port, UART_LCR, 0);
+		spin_unlock_irqrestore(&port->lock, flags);
 	}
 
 	if (port->type == PORT_DA830) {
 		/* Reset the port */
-		serial_port_out(port, UART_IER, 0);
+		spin_lock_irqsave(&port->lock, flags);
+		serial8250_set_IER(up, 0);
 		serial_port_out(port, UART_DA830_PWREMU_MGMT, 0);
+		spin_unlock_irqrestore(&port->lock, flags);
 		mdelay(10);
 
 		/* Enable Tx, Rx and free run mode */
@ drivers/tty/serial/8250/8250_port.c:2362 @ int serial8250_do_startup(struct uart_port *port)
 	if (retval)
 		goto out;
 
+	is_console = serial8250_is_console(port);
+
 	if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) {
 		unsigned char iir1;
 
@ drivers/tty/serial/8250/8250_port.c:2380 @ int serial8250_do_startup(struct uart_port *port)
 		 */
 		spin_lock_irqsave(&port->lock, flags);
 
+		if (is_console)
+			serial8250_enter_unsafe(up);
+
 		wait_for_xmitr(up, UART_LSR_THRE);
 		serial_port_out_sync(port, UART_IER, UART_IER_THRI);
 		udelay(1); /* allow THRE to set */
@ drivers/tty/serial/8250/8250_port.c:2393 @ int serial8250_do_startup(struct uart_port *port)
 		iir = serial_port_in(port, UART_IIR);
 		serial_port_out(port, UART_IER, 0);
 
+		if (is_console)
+			serial8250_exit_unsafe(up);
+
 		spin_unlock_irqrestore(&port->lock, flags);
 
 		if (port->irqflags & IRQF_SHARED)
@ drivers/tty/serial/8250/8250_port.c:2450 @ int serial8250_do_startup(struct uart_port *port)
 	 * Do a quick test to see if we receive an interrupt when we enable
 	 * the TX irq.
 	 */
+	if (is_console)
+		serial8250_enter_unsafe(up);
 	serial_port_out(port, UART_IER, UART_IER_THRI);
 	lsr = serial_port_in(port, UART_LSR);
 	iir = serial_port_in(port, UART_IIR);
 	serial_port_out(port, UART_IER, 0);
+	if (is_console)
+		serial8250_exit_unsafe(up);
 
 	if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
 		if (!(up->bugs & UART_BUG_TXEN)) {
@ drivers/tty/serial/8250/8250_port.c:2489 @ int serial8250_do_startup(struct uart_port *port)
 	if (up->dma) {
 		const char *msg = NULL;
 
-		if (uart_console(port))
+		if (is_console)
 			msg = "forbid DMA for kernel console";
 		else if (serial8250_request_dma(up))
 			msg = "failed to request DMA";
@ drivers/tty/serial/8250/8250_port.c:2540 @ void serial8250_do_shutdown(struct uart_port *port)
 	 */
 	spin_lock_irqsave(&port->lock, flags);
 	up->ier = 0;
-	serial_port_out(port, UART_IER, 0);
+	serial8250_set_IER(up, 0);
 	spin_unlock_irqrestore(&port->lock, flags);
 
 	synchronize_irq(port->irq);
@ drivers/tty/serial/8250/8250_port.c:2906 @ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
 	if (up->capabilities & UART_CAP_RTOIE)
 		up->ier |= UART_IER_RTOIE;
 
-	serial_port_out(port, UART_IER, up->ier);
+	serial8250_set_IER(up, up->ier);
 
 	if (up->capabilities & UART_CAP_EFR) {
 		unsigned char efr = 0;
@ drivers/tty/serial/8250/8250_port.c:3371 @ EXPORT_SYMBOL_GPL(serial8250_set_defaults);
 
 #ifdef CONFIG_SERIAL_8250_CONSOLE
 
-static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
+static bool serial8250_console_putchar(struct uart_port *port, unsigned char ch,
+				       struct cons_write_context *wctxt)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 
 	wait_for_xmitr(up, UART_LSR_THRE);
+	if (!console_can_proceed(wctxt))
+		return false;
 	serial_port_out(port, UART_TX, ch);
+	if (ch == '\n')
+		up->console_newline_needed = false;
+	else
+		up->console_newline_needed = true;
+
+	return true;
 }
 
 /*
@ drivers/tty/serial/8250/8250_port.c:3414 @ static void serial8250_console_restore(struct uart_8250_port *up)
 	serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS);
 }
 
-/*
- * Print a string to the serial port using the device FIFO
- *
- * It sends fifosize bytes and then waits for the fifo
- * to get empty.
- */
-static void serial8250_console_fifo_write(struct uart_8250_port *up,
-					  const char *s, unsigned int count)
+static bool __serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt,
+		const char *s, unsigned int count,
+		bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *))
 {
-	int i;
-	const char *end = s + count;
-	unsigned int fifosize = up->tx_loadsz;
-	bool cr_sent = false;
+	bool finished = false;
+	unsigned int i;
 
-	while (s != end) {
-		wait_for_lsr(up, UART_LSR_THRE);
-
-		for (i = 0; i < fifosize && s != end; ++i) {
-			if (*s == '\n' && !cr_sent) {
-				serial_out(up, UART_TX, '\r');
-				cr_sent = true;
-			} else {
-				serial_out(up, UART_TX, *s++);
-				cr_sent = false;
-			}
+	for (i = 0; i < count; i++, s++) {
+		if (*s == '\n') {
+			if (!putchar(port, '\r', wctxt))
+				goto out;
 		}
+		if (!putchar(port, *s, wctxt))
+			goto out;
 	}
+	finished = true;
+out:
+	return finished;
+}
+
+static bool serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt,
+		const char *s, unsigned int count,
+		bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *))
+{
+	return __serial8250_console_write(port, wctxt, s, count, putchar);
+}
+
+static bool atomic_print_line(struct uart_8250_port *up,
+			      struct cons_write_context *wctxt)
+{
+	struct uart_port *port = &up->port;
+
+	if (up->console_newline_needed &&
+	    !__serial8250_console_write(port, wctxt, "\n", 1, serial8250_console_putchar)) {
+		return false;
+	}
+
+	return __serial8250_console_write(port, wctxt, wctxt->outbuf, wctxt->len,
+					  serial8250_console_putchar);
+}
+
+static void atomic_console_reacquire(struct cons_write_context *wctxt,
+				     struct cons_write_context *wctxt_init)
+{
+	memcpy(wctxt, wctxt_init, sizeof(*wctxt));
+	while (!console_try_acquire(wctxt)) {
+		cpu_relax();
+		memcpy(wctxt, wctxt_init, sizeof(*wctxt));
+	}
+}
+
+bool serial8250_console_write_atomic(struct uart_8250_port *up,
+				     struct cons_write_context *wctxt)
+{
+	struct cons_write_context wctxt_init = { };
+	struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt);
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	bool finished = false;
+	unsigned int ier;
+
+	touch_nmi_watchdog();
+
+	/* With write_atomic, another context may hold the port->lock. */
+
+	ctxt_init->console = ctxt->console;
+	ctxt_init->prio = ctxt->prio;
+	ctxt_init->thread = ctxt->thread;
+
+	/*
+	 * Enter unsafe in order to disable interrupts. If the console is
+	 * lost before the interrupts are disabled, bail out because another
+	 * context took over the printing. If the console is lost after the
+	 * interrupts are disabled, the console must be reacquired in order
+	 * to re-enable the interrupts. However, in that case no printing is
+	 * allowed because another context took over the printing.
+	 */
+
+	if (!console_enter_unsafe(wctxt))
+		return false;
+
+	if (!__serial8250_clear_IER(up, wctxt, &ier))
+		return false;
+
+	if (!console_exit_unsafe(wctxt)) {
+		atomic_console_reacquire(wctxt, &wctxt_init);
+		goto enable_irq;
+	}
+
+	if (!atomic_print_line(up, wctxt)) {
+		atomic_console_reacquire(wctxt, &wctxt_init);
+		goto enable_irq;
+	}
+
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
+	finished = true;
+enable_irq:
+	/*
+	 * Enter unsafe in order to enable interrupts. If the console is
+	 * lost before the interrupts are enabled, the console must be
+	 * reacquired in order to re-enable the interrupts.
+	 */
+	for (;;) {
+		if (console_enter_unsafe(wctxt) &&
+		    __serial8250_set_IER(up, wctxt, ier)) {
+			break;
+		}
+
+		/* HW-IRQs still disabled. Reacquire to enable them. */
+		atomic_console_reacquire(wctxt, &wctxt_init);
+	}
+	console_exit_unsafe(wctxt);
+
+	return finished;
 }
 
 /*
@ drivers/tty/serial/8250/8250_port.c:3538 @ static void serial8250_console_fifo_write(struct uart_8250_port *up,
  *	Doing runtime PM is really a bad idea for the kernel console.
  *	Thus, we assume the function is called when device is powered up.
  */
-void serial8250_console_write(struct uart_8250_port *up, const char *s,
-			      unsigned int count)
+bool serial8250_console_write_thread(struct uart_8250_port *up,
+				     struct cons_write_context *wctxt)
 {
+	struct cons_write_context wctxt_init = { };
+	struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt);
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
 	struct uart_8250_em485 *em485 = up->em485;
 	struct uart_port *port = &up->port;
-	unsigned long flags;
-	unsigned int ier, use_fifo;
-	int locked = 1;
+	unsigned int count = wctxt->len;
+	const char *s = wctxt->outbuf;
+	bool rs485_started = false;
+	bool finished = false;
+	unsigned int ier;
 
-	touch_nmi_watchdog();
-
-	if (oops_in_progress)
-		locked = spin_trylock_irqsave(&port->lock, flags);
-	else
-		spin_lock_irqsave(&port->lock, flags);
+	ctxt_init->console = ctxt->console;
+	ctxt_init->prio = ctxt->prio;
+	ctxt_init->thread = ctxt->thread;
 
 	/*
-	 *	First save the IER then disable the interrupts
+	 * Enter unsafe in order to disable interrupts. If the console is
+	 * lost before the interrupts are disabled, bail out because another
+	 * context took over the printing. If the console is lost after the
+	 * interrupts are disabled, the console must be reacquired in order
+	 * to re-enable the interrupts. However, in that case no printing is
+	 * allowed because another context took over the printing.
 	 */
-	ier = serial_port_in(port, UART_IER);
-	serial8250_clear_IER(up);
+
+	if (!console_enter_unsafe(wctxt))
+		return false;
+
+	if (!__serial8250_clear_IER(up, wctxt, &ier))
+		return false;
+
+	if (!console_exit_unsafe(wctxt)) {
+		atomic_console_reacquire(wctxt, &wctxt_init);
+		goto enable_irq;
+	}
 
 	/* check scratch reg to see if port powered off during system sleep */
 	if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
+		if (!console_enter_unsafe(wctxt)) {
+			atomic_console_reacquire(wctxt, &wctxt_init);
+			goto enable_irq;
+		}
 		serial8250_console_restore(up);
+		if (!console_exit_unsafe(wctxt)) {
+			atomic_console_reacquire(wctxt, &wctxt_init);
+			goto enable_irq;
+		}
 		up->canary = 0;
 	}
 
 	if (em485) {
-		if (em485->tx_stopped)
+		if (em485->tx_stopped) {
+			if (!console_enter_unsafe(wctxt)) {
+				atomic_console_reacquire(wctxt, &wctxt_init);
+				goto enable_irq;
+			}
 			up->rs485_start_tx(up);
-		mdelay(port->rs485.delay_rts_before_send);
+			rs485_started = true;
+			if (!console_exit_unsafe(wctxt)) {
+				atomic_console_reacquire(wctxt, &wctxt_init);
+				goto enable_irq;
+			}
+		}
+		if (port->rs485.delay_rts_before_send) {
+			mdelay(port->rs485.delay_rts_before_send);
+			if (!console_can_proceed(wctxt)) {
+				atomic_console_reacquire(wctxt, &wctxt_init);
+				goto enable_irq;
+			}
+		}
 	}
 
-	use_fifo = (up->capabilities & UART_CAP_FIFO) &&
-		/*
-		 * BCM283x requires to check the fifo
-		 * after each byte.
-		 */
-		!(up->capabilities & UART_CAP_MINI) &&
-		/*
-		 * tx_loadsz contains the transmit fifo size
-		 */
-		up->tx_loadsz > 1 &&
-		(up->fcr & UART_FCR_ENABLE_FIFO) &&
-		port->state &&
-		test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) &&
-		/*
-		 * After we put a data in the fifo, the controller will send
-		 * it regardless of the CTS state. Therefore, only use fifo
-		 * if we don't use control flow.
-		 */
-		!(up->port.flags & UPF_CONS_FLOW);
+	if (!serial8250_console_write(port, wctxt, s, count, serial8250_console_putchar)) {
+		atomic_console_reacquire(wctxt, &wctxt_init);
+		goto enable_irq;
+	}
 
-	if (likely(use_fifo))
-		serial8250_console_fifo_write(up, s, count);
-	else
-		uart_console_write(port, s, count, serial8250_console_putchar);
-
-	/*
-	 *	Finally, wait for transmitter to become empty
-	 *	and restore the IER
-	 */
 	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
-
+	finished = true;
+enable_irq:
+	/*
+	 * Enter unsafe in order to stop rs485_tx. If the console is
+	 * lost before the rs485_tx is stopped, the console must be
+	 * reacquired in order to stop rs485_tx.
+	 */
 	if (em485) {
 		mdelay(port->rs485.delay_rts_after_send);
-		if (em485->tx_stopped)
+		if (em485->tx_stopped && rs485_started) {
+			while (!console_enter_unsafe(wctxt))
+				atomic_console_reacquire(wctxt, &wctxt_init);
 			up->rs485_stop_tx(up);
+			if (!console_exit_unsafe(wctxt))
+				atomic_console_reacquire(wctxt, &wctxt_init);
+		}
 	}
 
-	serial_port_out(port, UART_IER, ier);
+	/*
+	 * Enter unsafe in order to enable interrupts. If the console is
+	 * lost before the interrupts are enabled, the console must be
+	 * reacquired in order to re-enable the interrupts.
+	 */
+	for (;;) {
+		if (console_enter_unsafe(wctxt) &&
+		    __serial8250_set_IER(up, wctxt, ier)) {
+			break;
+		}
+		atomic_console_reacquire(wctxt, &wctxt_init);
+	}
 
 	/*
 	 *	The receive handling will happen properly because the
@ drivers/tty/serial/8250/8250_port.c:3659 @ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 	if (up->msr_saved_flags)
 		serial8250_modem_status(up);
 
-	if (locked)
-		spin_unlock_irqrestore(&port->lock, flags);
+	console_exit_unsafe(wctxt);
+
+	return finished;
 }
 
 static unsigned int probe_baud(struct uart_port *port)
@ drivers/tty/serial/8250/8250_port.c:3681 @ static unsigned int probe_baud(struct uart_port *port)
 
 int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
 {
+	struct uart_8250_port *up = up_to_u8250p(port);
 	int baud = 9600;
 	int bits = 8;
 	int parity = 'n';
@ drivers/tty/serial/8250/8250_port.c:3691 @ int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
 	if (!port->iobase && !port->membase)
 		return -ENODEV;
 
+	up->console_newline_needed = false;
+
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
 	else if (probe)
@ drivers/tty/serial/8250/Kconfig:12 @ config SERIAL_8250
 	depends on !S390
 	select SERIAL_CORE
 	select SERIAL_MCTRL_GPIO if GPIOLIB
+	select HAVE_ATOMIC_CONSOLE
 	help
 	  This selects whether you want to include the driver for the standard
 	  serial ports.  The standard answer is Y.  People who might say N
@ drivers/tty/serial/amba-pl011.c:2322 @ pl011_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_amba_port *uap = amba_ports[co->index];
 	unsigned int old_cr = 0, new_cr;
-	unsigned long flags;
+	unsigned long flags = 0;
 	int locked = 1;
 
 	clk_enable(uap->clk);
 
-	local_irq_save(flags);
+	/*
+	 * local_irq_save(flags);
+	 *
+	 * This local_irq_save() is nonsense. If we come in via sysrq
+	 * handling, interrupts are already disabled. Aside from that,
+	 * the port.sysrq check is racy on SMP regardless.
+	 */
 	if (uap->port.sysrq)
 		locked = 0;
 	else if (oops_in_progress)
-		locked = spin_trylock(&uap->port.lock);
+		locked = spin_trylock_irqsave(&uap->port.lock, flags);
 	else
-		spin_lock(&uap->port.lock);
+		spin_lock_irqsave(&uap->port.lock, flags);
 
 	/*
 	 *	First save the CR then disable the interrupts
@ drivers/tty/serial/amba-pl011.c:2365 @ pl011_console_write(struct console *co, const char *s, unsigned int count)
 		pl011_write(old_cr, uap, REG_CR);
 
 	if (locked)
-		spin_unlock(&uap->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&uap->port.lock, flags);
 
 	clk_disable(uap->clk);
 }
@ drivers/tty/serial/omap-serial.c:1222 @ serial_omap_console_write(struct console *co, const char *s,
 	unsigned int ier;
 	int locked = 1;
 
-	local_irq_save(flags);
-	if (up->port.sysrq)
-		locked = 0;
-	else if (oops_in_progress)
-		locked = spin_trylock(&up->port.lock);
+	if (up->port.sysrq || oops_in_progress)
+		locked = spin_trylock_irqsave(&up->port.lock, flags);
 	else
-		spin_lock(&up->port.lock);
+		spin_lock_irqsave(&up->port.lock, flags);
 
 	/*
 	 * First save the IER then disable the interrupts
@ drivers/tty/serial/omap-serial.c:1252 @ serial_omap_console_write(struct console *co, const char *s,
 		check_modem_status(up);
 
 	if (locked)
-		spin_unlock(&up->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
 static int __init
@ drivers/tty/serial/serial_core.c:2339 @ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 	 * able to Re-start_rx later.
 	 */
 	if (!console_suspend_enabled && uart_console(uport)) {
-		if (uport->ops->start_rx)
+		if (uport->ops->start_rx) {
+			spin_lock_irq(&uport->lock);
 			uport->ops->stop_rx(uport);
+			spin_unlock_irq(&uport->lock);
+		}
 		goto unlock;
 	}
 
@ drivers/tty/serial/serial_core.c:2436 @ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 		if (console_suspend_enabled)
 			uart_change_pm(state, UART_PM_STATE_ON);
 		uport->ops->set_termios(uport, &termios, NULL);
-		if (!console_suspend_enabled && uport->ops->start_rx)
+		if (!console_suspend_enabled && uport->ops->start_rx) {
+			spin_lock_irq(&uport->lock);
 			uport->ops->start_rx(uport);
+			spin_unlock_irq(&uport->lock);
+		}
 		if (console_suspend_enabled)
 			console_start(uport->cons);
 	}
@ drivers/tty/tty_io.c:3546 @ static ssize_t show_cons_active(struct device *dev,
 	for_each_console(c) {
 		if (!c->device)
 			continue;
-		if (!c->write)
-			continue;
+		if (c->flags & CON_NO_BKL) {
+			if (!(c->write_thread || c->write_atomic))
+				continue;
+		} else {
+			if (!c->write)
+				continue;
+		}
 		if ((c->flags & CON_ENABLED) == 0)
 			continue;
 		cs[i++] = c;
@ fs/proc/consoles.c:24 @ static int show_console_dev(struct seq_file *m, void *v)
 		{ CON_ENABLED,		'E' },
 		{ CON_CONSDEV,		'C' },
 		{ CON_BOOT,		'B' },
+		{ CON_NO_BKL,		'N' },
 		{ CON_PRINTBUFFER,	'p' },
 		{ CON_BRL,		'b' },
 		{ CON_ANYTIME,		'a' },
 	};
 	char flags[ARRAY_SIZE(con_flags) + 1];
 	struct console *con = v;
+	char con_write = '-';
 	unsigned int a;
 	dev_t dev = 0;
 
@ fs/proc/consoles.c:62 @ static int show_console_dev(struct seq_file *m, void *v)
 	seq_setwidth(m, 21 - 1);
 	seq_printf(m, "%s%d", con->name, con->index);
 	seq_pad(m, ' ');
-	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
-			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
-			flags);
+	if (con->flags & CON_NO_BKL) {
+		if (con->write_thread || con->write_atomic)
+			con_write = 'W';
+	} else {
+		if (con->write)
+			con_write = 'W';
+	}
+	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write,
+		   con->unblank ? 'U' : '-', flags);
 	if (dev)
 		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
 
@ include/linux/console.h:19 @
 
 #include <linux/atomic.h>
 #include <linux/bits.h>
+#include <linux/irq_work.h>
 #include <linux/rculist.h>
+#include <linux/rcuwait.h>
 #include <linux/types.h>
 
 struct vc_data;
@ include/linux/console.h:159 @ static inline int con_debug_leave(void)
  *			receiving the printk spam for obvious reasons.
  * @CON_EXTENDED:	The console supports the extended output format of
  *			/dev/kmesg which requires a larger output buffer.
+ * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
+ *			printing callbacks must not be called.
+ * @CON_NO_BKL:		Console can operate outside of the BKL style console_lock
+ *			constraints.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@ include/linux/console.h:172 @ enum cons_flags {
 	CON_ANYTIME		= BIT(4),
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
+	CON_SUSPENDED		= BIT(7),
+	CON_NO_BKL		= BIT(8),
 };
 
+/**
+ * struct cons_state - console state for NOBKL consoles
+ * @atom:	Compound of the state fields for atomic operations
+ * @seq:	Sequence for record tracking (64bit only)
+ * @bits:	Compound of the state bits below
+ *
+ * @locked:	Console is locked by a writer
+ * @unsafe:	Console is busy in a non takeover region
+ * @thread:	Current owner is the printk thread
+ * @cur_prio:	The priority of the current output
+ * @req_prio:	The priority of a handover request
+ * @cpu:	The CPU on which the writer runs
+ *
+ * To be used for state read and preparation of atomic_long_cmpxchg()
+ * operations.
+ *
+ * The @req_prio field is particularly important: it allows spin-waiting
+ * acquirers to time out and give up without the risk of being handed the
+ * lock after giving up. As a nice side effect, it also makes a single
+ * read+cmpxchg sufficient for the common case of acquire and release.
+ */
+struct cons_state {
+	union {
+		unsigned long	atom;
+		struct {
+#ifdef CONFIG_64BIT
+			u32	seq;
+#endif
+			union {
+				u32	bits;
+				struct {
+					u32 locked	:  1;
+					u32 unsafe	:  1;
+					u32 thread	:  1;
+					u32 cur_prio	:  2;
+					u32 req_prio	:  2;
+					u32 cpu		: 18;
+				};
+			};
+		};
+	};
+};
+
+/**
+ * enum cons_prio - console writer priority for NOBKL consoles
+ * @CONS_PRIO_NONE:		Unused
+ * @CONS_PRIO_NORMAL:		Regular printk
+ * @CONS_PRIO_EMERGENCY:	Emergency output (WARN/OOPS...)
+ * @CONS_PRIO_PANIC:		Panic output
+ * @CONS_PRIO_MAX:		The number of priority levels
+ *
+ * Emergency output can carefully take over the console even without consent
+ * of the owner, ideally only when @cons_state::unsafe is not set. Panic
+ * output can ignore the unsafe flag as a last resort. If panic output is
+ * active, no takeover is possible until the panic output releases the
+ * console.
+ */
+enum cons_prio {
+	CONS_PRIO_NONE = 0,
+	CONS_PRIO_NORMAL,
+	CONS_PRIO_EMERGENCY,
+	CONS_PRIO_PANIC,
+	CONS_PRIO_MAX,
+};
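To make the cons_state kernel-doc above concrete, here is a hedged sketch of the intended read-then-single-cmpxchg pattern. It is illustrative only, not the printk core's real acquire helper; the backing atomic_long_t is the per-console @atomic_state member added to struct console further below.

static bool cons_state_try_lock_sketch(atomic_long_t *state, enum cons_prio prio)
{
	struct cons_state old, new;

	old.atom = atomic_long_read(state);
	if (old.locked)
		return false;			/* another writer owns it */

	new.atom = old.atom;			/* carry seq and the other bits */
	new.locked = 1;
	new.unsafe = 0;
	new.cur_prio = prio;
	new.cpu = raw_smp_processor_id();

	/* The whole compound state is updated with one cmpxchg. */
	return atomic_long_try_cmpxchg(state, &old.atom, new.atom);
}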
+
+struct console;
+struct printk_buffers;
+
+/**
+ * struct cons_context - Context for console acquire/release
+ * @console:		The associated console
+ * @state:		The state at acquire time
+ * @old_state:		The old state when try_acquire() failed for analysis
+ *			by the caller
+ * @hov_state:		The handover state for spin and cleanup
+ * @req_state:		The request state for spin and cleanup
+ * @spinwait_max_us:	Limit for spinwait acquire
+ * @oldseq:		The sequence number at acquire()
+ * @newseq:		The sequence number for progress
+ * @prio:		Priority of the context
+ * @pbufs:		Pointer to the text buffer for this context
+ * @dropped:		Dropped counter for the current context
+ * @thread:		The acquire is printk thread context
+ * @hostile:		Hostile takeover requested. Cleared on normal
+ *			acquire or friendly handover
+ * @spinwait:		Spinwait on acquire if possible
+ * @backlog:		Ringbuffer has pending records
+ */
+struct cons_context {
+	struct console		*console;
+	struct cons_state	state;
+	struct cons_state	old_state;
+	struct cons_state	hov_state;
+	struct cons_state	req_state;
+	u64			oldseq;
+	u64			newseq;
+	unsigned int		spinwait_max_us;
+	enum cons_prio		prio;
+	struct printk_buffers	*pbufs;
+	unsigned long		dropped;
+	unsigned int		thread		: 1;
+	unsigned int		hostile		: 1;
+	unsigned int		spinwait	: 1;
+	unsigned int		backlog		: 1;
+};
+
+/**
+ * struct cons_write_context - Context handed to the write callbacks
+ * @ctxt:	The core console context
+ * @outbuf:	Pointer to the text buffer for output
+ * @len:	Length to write
+ * @unsafe:	Invoked in unsafe state due to force takeover
+ */
+struct cons_write_context {
+	struct cons_context	__private ctxt;
+	char			*outbuf;
+	unsigned int		len;
+	bool			unsafe;
+};
+
+struct cons_context_data;
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@ include/linux/console.h:318 @ enum cons_flags {
  * @dropped:		Number of unreported dropped ringbuffer records
  * @data:		Driver private data
  * @node:		hlist node for the console list
+ *
+ * @atomic_state:	State array for NOBKL consoles; real and handover
+ * @atomic_seq:		Sequence for record tracking (32bit only)
+ * @thread_pbufs:	Pointer to thread private buffer
+ * @kthread:		Pointer to kernel thread
+ * @rcuwait:		RCU wait for the kernel thread
+ * @irq_work:		IRQ work for thread wakeup
+ * @kthread_waiting:	Indicator whether the kthread is waiting to be woken
+ * @write_atomic:	Write callback for atomic context
+ * @write_thread:	Write callback for printk threaded printing
+ * @port_lock:		Callback to lock/unlock the port lock
+ * @pcpu_data:		Pointer to percpu context data
  */
 struct console {
 	char			name[16];
@ include/linux/console.h:349 @ struct console {
 	unsigned long		dropped;
 	void			*data;
 	struct hlist_node	node;
+
+	/* NOBKL console specific members */
+	atomic_long_t		__private atomic_state[2];
+#ifndef CONFIG_64BIT
+	atomic_t		__private atomic_seq;
+#endif
+	struct printk_buffers	*thread_pbufs;
+	struct task_struct	*kthread;
+	struct rcuwait		rcuwait;
+	struct irq_work		irq_work;
+	atomic_t		kthread_waiting;
+
+	bool (*write_atomic)(struct console *con, struct cons_write_context *wctxt);
+	bool (*write_thread)(struct console *con, struct cons_write_context *wctxt);
+	void (*port_lock)(struct console *con, bool do_lock, unsigned long *flags);
+
+	struct cons_context_data	__percpu *pcpu_data;
 };
 
 #ifdef CONFIG_LOCKDEP
@ include/linux/console.h:492 @ static inline bool console_is_registered(const struct console *con)
 	lockdep_assert_console_list_lock_held();			\
 	hlist_for_each_entry(con, &console_list, node)
 
+#ifdef CONFIG_PRINTK
+extern enum cons_prio cons_atomic_enter(enum cons_prio prio);
+extern void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio);
+extern bool console_can_proceed(struct cons_write_context *wctxt);
+extern bool console_enter_unsafe(struct cons_write_context *wctxt);
+extern bool console_exit_unsafe(struct cons_write_context *wctxt);
+extern bool console_try_acquire(struct cons_write_context *wctxt);
+extern bool console_release(struct cons_write_context *wctxt);
+#else
+static inline enum cons_prio cons_atomic_enter(enum cons_prio prio) { return CONS_PRIO_NONE; }
+static inline void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio) { }
+static inline bool console_can_proceed(struct cons_write_context *wctxt) { return false; }
+static inline bool console_enter_unsafe(struct cons_write_context *wctxt) { return false; }
+static inline bool console_exit_unsafe(struct cons_write_context *wctxt) { return false; }
+static inline bool console_try_acquire(struct cons_write_context *wctxt) { return false; }
+static inline bool console_release(struct cons_write_context *wctxt) { return false; }
+#endif
+
 extern int console_set_on_cmdline;
 extern struct console *early_console;
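A hedged sketch of how a driver's write_atomic callback is expected to use the helpers declared above, for an imaginary MMIO UART; the my_uart_*() helpers are assumptions, not from this patch. Register accesses that must not be interrupted by a takeover sit between console_enter_unsafe() and console_exit_unsafe(), and console_can_proceed() is polled between characters so a higher-priority context can take the console at a character boundary. A production driver (see the 8250 changes earlier in this series) re-acquires with console_try_acquire() after losing ownership in order to restore the interrupt mask.

static bool my_uart_write_atomic(struct console *con,
				 struct cons_write_context *wctxt)
{
	unsigned int i;

	/* Masking the UART interrupt is a non-takeover (unsafe) region. */
	if (!console_enter_unsafe(wctxt))
		return false;			/* ownership lost, nothing printed */
	my_uart_mask_irq(con);			/* hypothetical */
	if (!console_exit_unsafe(wctxt))
		return false;			/* a real driver reacquires here */

	for (i = 0; i < wctxt->len; i++) {
		if (!console_can_proceed(wctxt))
			return false;		/* handed over mid-record */
		my_uart_put_char(con, wctxt->outbuf[i]);	/* hypothetical */
	}

	if (!console_enter_unsafe(wctxt))
		return false;
	my_uart_unmask_irq(con);		/* hypothetical */
	return console_exit_unsafe(wctxt);
}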
 
@ include/linux/entry-common.h:60 @
 # define ARCH_EXIT_TO_USER_MODE_WORK		(0)
 #endif
 
+#ifdef CONFIG_PREEMPT_LAZY
+# define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#else
+# define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED)
+#endif
+
 #define EXIT_TO_USER_MODE_WORK						\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
-	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
+	 _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
 	 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
@ include/linux/interrupt.h:608 @ extern void __raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
+#ifdef CONFIG_PREEMPT_RT
+DECLARE_PER_CPU(struct task_struct *, timersd);
+DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
+
+extern void raise_timer_softirq(void);
+extern void raise_hrtimer_softirq(void);
+
+static inline unsigned int local_pending_timers(void)
+{
+	return __this_cpu_read(pending_timer_softirq);
+}
+
+#else
+static inline void raise_timer_softirq(void)
+{
+	raise_softirq(TIMER_SOFTIRQ);
+}
+
+static inline void raise_hrtimer_softirq(void)
+{
+	raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
+static inline unsigned int local_pending_timers(void)
+{
+	return local_softirq_pending();
+}
+#endif
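For illustration, a hedged sketch of a caller-side check built on the wrappers above; the helper name is hypothetical. On PREEMPT_RT, local_pending_timers() reports only the timer softirqs routed to the per-CPU timersd thread; elsewhere it falls back to the full softirq pending mask, so masking with the timer bits is needed either way.

static inline bool my_timer_softirq_pending(void)
{
	return local_pending_timers() &
	       (BIT(TIMER_SOFTIRQ) | BIT(HRTIMER_SOFTIRQ));
}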
+
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
 
 static inline struct task_struct *this_cpu_ksoftirqd(void)
@ include/linux/io-mapping.h:72 @ io_mapping_map_atomic_wc(struct io_mapping *mapping,
 
 	BUG_ON(offset >= mapping->size);
 	phys_addr = mapping->base + offset;
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		migrate_disable();
 	pagefault_disable();
 	return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
 }
@ include/linux/io-mapping.h:85 @ io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	kunmap_local_indexed((void __force *)vaddr);
 	pagefault_enable();
-	preempt_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		migrate_enable();
 }
 
 static inline void __iomem *
@ include/linux/io-mapping.h:171 @ static inline void __iomem *
 io_mapping_map_atomic_wc(struct io_mapping *mapping,
 			 unsigned long offset)
 {
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		migrate_disable();
 	pagefault_disable();
 	return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
 }
@ include/linux/io-mapping.h:184 @ io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	io_mapping_unmap(vaddr);
 	pagefault_enable();
-	preempt_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		migrate_enable();
 }
 
 static inline void __iomem *
@ include/linux/netdevice.h:3205 @ struct softnet_data {
 	int			defer_count;
 	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
+#ifndef CONFIG_PREEMPT_RT
 	call_single_data_t	defer_csd;
+#else
+	struct work_struct	defer_work;
+#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
@ include/linux/preempt.h:199 @ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
+#ifdef CONFIG_PREEMPT_LAZY
+#define add_preempt_lazy_count(val)	do { preempt_lazy_count() += (val); } while (0)
+#define sub_preempt_lazy_count(val)	do { preempt_lazy_count() -= (val); } while (0)
+#define inc_preempt_lazy_count()	add_preempt_lazy_count(1)
+#define dec_preempt_lazy_count()	sub_preempt_lazy_count(1)
+#define preempt_lazy_count()		(current_thread_info()->preempt_lazy_count)
+#else
+#define add_preempt_lazy_count(val)	do { } while (0)
+#define sub_preempt_lazy_count(val)	do { } while (0)
+#define inc_preempt_lazy_count()	do { } while (0)
+#define dec_preempt_lazy_count()	do { } while (0)
+#define preempt_lazy_count()		(0)
+#endif
+
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
@ include/linux/preempt.h:221 @ do { \
 	barrier(); \
 } while (0)
 
+#define preempt_lazy_disable() \
+do { \
+	inc_preempt_lazy_count(); \
+	barrier(); \
+} while (0)
+
 #define sched_preempt_enable_no_resched() \
 do { \
 	barrier(); \
@ include/linux/preempt.h:241 @ do { \
 #define preempt_enable() \
 do { \
 	barrier(); \
-	if (unlikely(preempt_count_dec_and_test())) \
+	if (unlikely(preempt_count_dec_and_test())) { \
+		instrumentation_begin(); \
 		__preempt_schedule(); \
+		instrumentation_end(); \
+	} \
 } while (0)
 
 #define preempt_enable_notrace() \
 do { \
 	barrier(); \
-	if (unlikely(__preempt_count_dec_and_test())) \
+	if (unlikely(__preempt_count_dec_and_test())) { \
+		instrumentation_begin(); \
 		__preempt_schedule_notrace(); \
+		instrumentation_end(); \
+	} \
 } while (0)
 
 #define preempt_check_resched() \
@ include/linux/preempt.h:264 @ do { \
 		__preempt_schedule(); \
 } while (0)
 
+/*
+ * Open code preempt_check_resched() here because it is not exported to
+ * modules but preempt_lazy_enable() is used by local_unlock() and
+ * bpf_enable_instrumentation().
+ */
+#define preempt_lazy_enable() \
+do { \
+	dec_preempt_lazy_count(); \
+	barrier(); \
+	if (should_resched(0)) \
+		__preempt_schedule(); \
+} while (0)
+
 #else /* !CONFIG_PREEMPTION */
 #define preempt_enable() \
 do { \
@ include/linux/preempt.h:283 @ do { \
 	preempt_count_dec(); \
 } while (0)
 
+#define preempt_lazy_enable() \
+do { \
+	dec_preempt_lazy_count(); \
+	barrier(); \
+} while (0)
+
 #define preempt_enable_notrace() \
 do { \
 	barrier(); \
@ include/linux/preempt.h:329 @ do { \
 #define preempt_enable_notrace()		barrier()
 #define preemptible()				0
 
+#define preempt_lazy_disable()			barrier()
+#define preempt_lazy_enable()			barrier()
+
 #endif /* CONFIG_PREEMPT_COUNT */
 
 #ifdef MODULE
@ include/linux/preempt.h:350 @ do { \
 } while (0)
 #define preempt_fold_need_resched() \
 do { \
-	if (tif_need_resched()) \
+	if (tif_need_resched_now()) \
 		set_preempt_need_resched(); \
 } while (0)
 
@ include/linux/preempt.h:466 @ extern void migrate_enable(void);
 
 #else
 
-static inline void migrate_disable(void) { }
-static inline void migrate_enable(void) { }
+static inline void migrate_disable(void)
+{
+	preempt_lazy_disable();
+}
+
+static inline void migrate_enable(void)
+{
+	preempt_lazy_enable();
+}
 
 #endif /* CONFIG_SMP */
 
@ include/linux/printk.h:142 @ void early_printk(const char *s, ...) { }
 #endif
 
 struct dev_printk_info;
+struct cons_write_context;
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(4, 0)
@ include/linux/printk.h:161 @ int _printk(const char *fmt, ...);
  */
 __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);
 
-extern void __printk_safe_enter(void);
-extern void __printk_safe_exit(void);
+extern void __printk_safe_enter(unsigned long *flags);
+extern void __printk_safe_exit(unsigned long *flags);
+extern void __printk_deferred_enter(void);
+extern void __printk_deferred_exit(void);
 /*
  * The printk_deferred_enter/exit macros are available only as a hack for
  * some code paths that need to defer all printk console printing. Interrupts
  * must be disabled for the deferred duration.
  */
-#define printk_deferred_enter __printk_safe_enter
-#define printk_deferred_exit __printk_safe_exit
+#define printk_deferred_enter() __printk_deferred_enter()
+#define printk_deferred_exit() __printk_deferred_exit()
 
 /*
  * Please don't use printk_ratelimit(), because it shares ratelimiting state
@ include/linux/printk.h:198 @ void show_regs_print_info(const char *log_lvl);
 extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
 void printk_trigger_flush(void);
+extern void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt,
+			      bool skip_unsafe);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@ include/linux/printk.h:279 @ static inline void dump_stack(void)
 static inline void printk_trigger_flush(void)
 {
 }
+
+static inline void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt,
+				     bool skip_unsafe)
+{
+}
+
 #endif
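A hedged sketch of how an emergency path (a WARN or oops handler, for example) is expected to combine cons_atomic_enter()/cons_atomic_exit() from <linux/console.h> with the flush hook declared above. The function name is an assumption, and passing NULL for the caller context is assumed to mean "no printk caller buffer to reuse".

static void my_emergency_flush(void)
{
	enum cons_prio prev;

	/* Raise this CPU's priority so NOBKL consoles may be taken over. */
	prev = cons_atomic_enter(CONS_PRIO_EMERGENCY);

	/* Flush pending records now; leave consoles that are currently
	 * in an unsafe section alone instead of forcing a takeover. */
	cons_atomic_flush(NULL, true);

	cons_atomic_exit(CONS_PRIO_EMERGENCY, prev);
}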
 
 #ifdef CONFIG_SMP
@ include/linux/sched.h:306 @ extern long schedule_timeout_idle(long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 asmlinkage void preempt_schedule_irq(void);
+
+extern void sched_submit_work(void);
+extern void sched_resume_work(void);
+extern void schedule_rtmutex(void);
+
 #ifdef CONFIG_PREEMPT_RT
  extern void schedule_rtlock(void);
 #endif
@ include/linux/sched.h:2067 @ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
+}
+
+static inline int need_resched_lazy(void)
+{
+	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int need_resched_now(void)
+{
+	return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#else
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
+static inline int need_resched_lazy(void) { return 0; }
+
+static inline int need_resched_now(void)
+{
+	return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#endif
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
@ include/linux/sched/signal.h:25 @ struct sighand_struct {
 	refcount_t		count;
 	wait_queue_head_t	signalfd_wqh;
 	struct k_sigaction	action[_NSIG];
+	struct sigqueue		*sigqueue_cache;
 };
 
 /*
@ include/linux/sched/signal.h:139 @ struct signal_struct {
 #ifdef CONFIG_POSIX_TIMERS
 
 	/* POSIX.1b Interval Timers */
-	int			posix_timer_id;
+	unsigned int		next_posix_timer_id;
 	struct list_head	posix_timers;
 
 	/* ITIMER_REAL timer for the process */
@ include/linux/sched/signal.h:353 @ extern int send_sig(int, struct task_struct *, int);
 extern int zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
+extern void sigqueue_free_cached_entry(struct sigqueue *q);
 extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
@ include/linux/serial_8250.h:128 @ struct uart_8250_port {
 #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
 	unsigned char		msr_saved_flags;
 
+	bool			console_newline_needed;
+
 	struct uart_8250_dma	*dma;
 	const struct uart_8250_ops *ops;
 
@ include/linux/serial_8250.h:144 @ struct uart_8250_port {
 	/* Serial port overrun backoff */
 	struct delayed_work overrun_backoff;
 	u32 overrun_backoff_time_ms;
+
+	struct cons_write_context wctxt;
+	int cookie;
 };
 
 static inline struct uart_8250_port *up_to_u8250p(struct uart_port *up)
@ include/linux/serial_8250.h:186 @ void serial8250_tx_chars(struct uart_8250_port *up);
 unsigned int serial8250_modem_status(struct uart_8250_port *up);
 void serial8250_init_port(struct uart_8250_port *up);
 void serial8250_set_defaults(struct uart_8250_port *up);
-void serial8250_console_write(struct uart_8250_port *up, const char *s,
-			      unsigned int count);
+bool serial8250_console_write_atomic(struct uart_8250_port *up,
+				     struct cons_write_context *wctxt);
+bool serial8250_console_write_thread(struct uart_8250_port *up,
+				     struct cons_write_context *wctxt);
 int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
 int serial8250_console_exit(struct uart_port *port);
 
@ include/linux/thread_info.h:181 @ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
 #endif /* !CONFIG_GENERIC_ENTRY */
 
 #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+# ifdef CONFIG_PREEMPT_LAZY
+
+static __always_inline bool tif_need_resched(void)
+{
+	return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
+}
+
+static __always_inline bool tif_need_resched_now(void)
+{
+	return arch_test_bit(TIF_NEED_RESCHED,
+			     (unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+	return arch_test_bit(TIF_NEED_RESCHED_LAZY,
+			     (unsigned long *)(&current_thread_info()->flags));
+}
+
+# else /* !CONFIG_PREEMPT_LAZY */
 
 static __always_inline bool tif_need_resched(void)
 {
@ include/linux/thread_info.h:208 @ static __always_inline bool tif_need_resched(void)
 			     (unsigned long *)(&current_thread_info()->flags));
 }
 
-#else
+static __always_inline bool tif_need_resched_now(void)
+{
+	return tif_need_resched();
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+	return false;
+}
+
+# endif /* CONFIG_PREEMPT_LAZY */
+#else /* !_ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
+# ifdef CONFIG_PREEMPT_LAZY
+
+static __always_inline bool tif_need_resched(void)
+{
+	return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
+}
+
+static __always_inline bool tif_need_resched_now(void)
+{
+	return test_bit(TIF_NEED_RESCHED,
+			(unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+	return test_bit(TIF_NEED_RESCHED_LAZY,
+			(unsigned long *)(&current_thread_info()->flags));
+}
+
+# else /* !CONFIG_PREEMPT_LAZY */
 
 static __always_inline bool tif_need_resched(void)
 {
@ include/linux/thread_info.h:247 @ static __always_inline bool tif_need_resched(void)
 			(unsigned long *)(&current_thread_info()->flags));
 }
 
+static __always_inline bool tif_need_resched_now(void)
+{
+	return tif_need_resched();
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+	return false;
+}
+
+# endif /* !CONFIG_PREEMPT_LAZY */
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
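A hedged sketch of a voluntary scheduling point that distinguishes the two flavours; my_process_batch() and handle_one() are hypothetical. TIF_NEED_RESCHED still means "preempt as soon as possible", while TIF_NEED_RESCHED_LAZY only asks for a reschedule at the next convenient point, which a loop like this can provide.

static void my_process_batch(void (*handle_one)(int item), int count)
{
	int i;

	for (i = 0; i < count; i++) {
		handle_one(i);

		/* Honour both an urgent and a lazy reschedule request. */
		if (tif_need_resched_now() || tif_need_resched_lazy())
			cond_resched();
	}
}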
 
 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
@ include/linux/trace_events.h:73 @ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
+	unsigned char		preempt_lazy_count;
 };
 
 #define TRACE_EVENT_TYPE_MAX						\
@ include/linux/trace_events.h:162 @ static inline void tracing_generic_entry_update(struct trace_entry *entry,
 						unsigned int trace_ctx)
 {
 	entry->preempt_count		= trace_ctx & 0xff;
+	entry->preempt_lazy_count	= (trace_ctx >> 16) & 0xff;
 	entry->pid			= current->pid;
 	entry->type			= type;
-	entry->flags =			trace_ctx >> 16;
+	entry->flags			= trace_ctx >> 24;
 }
 
 unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
@ include/linux/trace_events.h:176 @ enum trace_flag_type {
 	TRACE_FLAG_NEED_RESCHED		= 0x04,
 	TRACE_FLAG_HARDIRQ		= 0x08,
 	TRACE_FLAG_SOFTIRQ		= 0x10,
+#ifdef CONFIG_PREEMPT_LAZY
+	TRACE_FLAG_PREEMPT_RESCHED	= 0x00,
+	TRACE_FLAG_NEED_RESCHED_LAZY	= 0x20,
+#else
+	TRACE_FLAG_NEED_RESCHED_LAZY	= 0x00,
 	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
+#endif
 	TRACE_FLAG_NMI			= 0x40,
 	TRACE_FLAG_BH_OFF		= 0x80,
 };
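The two hunks above imply a repacking of the trace_ctx word. A hedged sketch of the layout as recovered from tracing_generic_entry_update(); the packing helper is illustrative, not part of the tracing core, and bits 8-15 are simply not consumed by the update function shown above.

/*
 * trace_ctx layout implied by tracing_generic_entry_update():
 *   bits  0..7  : preempt_count
 *   bits 16..23 : preempt_lazy_count
 *   bits 24..31 : TRACE_FLAG_* bits
 */
static inline unsigned int my_pack_trace_ctx(unsigned char trace_flags,
					     unsigned char lazy_count,
					     unsigned char preempt_count)
{
	return ((unsigned int)trace_flags << 24) |
	       ((unsigned int)lazy_count << 16) |
	       preempt_count;
}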
@ include/trace/events/timer.h:161 @ DEFINE_EVENT(timer_class, timer_cancel,
 		{ HRTIMER_MODE_ABS_SOFT,	"ABS|SOFT"	},	\
 		{ HRTIMER_MODE_REL_SOFT,	"REL|SOFT"	},	\
 		{ HRTIMER_MODE_ABS_PINNED_SOFT,	"ABS|PINNED|SOFT" },	\
-		{ HRTIMER_MODE_REL_PINNED_SOFT,	"REL|PINNED|SOFT" })
+		{ HRTIMER_MODE_REL_PINNED_SOFT,	"REL|PINNED|SOFT" },	\
+		{ HRTIMER_MODE_ABS_HARD,	"ABS|HARD" },		\
+		{ HRTIMER_MODE_REL_HARD,	"REL|HARD" },		\
+		{ HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },	\
+		{ HRTIMER_MODE_REL_PINNED_HARD,	"REL|PINNED|HARD" })
 
 /**
  * hrtimer_init - called when the hrtimer is initialized
@ kernel/Kconfig.preempt:4 @
 # SPDX-License-Identifier: GPL-2.0-only
 
+config HAVE_PREEMPT_LAZY
+	bool
+
+config PREEMPT_LAZY
+	def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT
+
 config PREEMPT_NONE_BUILD
 	bool
 
@ kernel/debug/kdb/kdb_io.c:579 @ static void kdb_msg_write(const char *msg, int msg_len)
 			continue;
 		if (c == dbg_io_ops->cons)
 			continue;
+		if (!c->write)
+			continue;
 		/*
 		 * Set oops_in_progress to encourage the console drivers to
 		 * disregard their internal spin locks: in the current calling
@ kernel/entry/common.c:158 @ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 		local_irq_enable_exit_to_user(ti_work);
 
-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & _TIF_NEED_RESCHED_MASK)
 			schedule();
 
 		if (ti_work & _TIF_UPROBE)
@ kernel/entry/common.c:389 @ void raw_irqentry_exit_cond_resched(void)
 		rcu_irq_exit_check_preempt();
 		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
 			WARN_ON_ONCE(!on_thread_stack());
-		if (need_resched())
+		if (should_resched(0))
 			preempt_schedule_irq();
 	}
 }
@ kernel/fork.c:1668 @ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	RCU_INIT_POINTER(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
+	sig->sigqueue_cache = NULL;
 
 	refcount_set(&sig->count, 1);
 	spin_lock_irq(&current->sighand->siglock);
@ kernel/fork.c:1685 @ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_sighand(struct sighand_struct *sighand)
 {
 	if (refcount_dec_and_test(&sighand->count)) {
+		struct sigqueue *sigqueue = NULL;
+
 		signalfd_cleanup(sighand);
+		spin_lock_irq(&sighand->siglock);
+		if (sighand->sigqueue_cache) {
+			sigqueue = sighand->sigqueue_cache;
+			sighand->sigqueue_cache = NULL;
+		}
+		spin_unlock_irq(&sighand->siglock);
+
+		sigqueue_free_cached_entry(sigqueue);
 		/*
 		 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
 		 * without an RCU grace period, see __lock_task_sighand().
@ kernel/ksysfs.c:170 @ KERNEL_ATTR_RO(vmcoreinfo);
 
 #endif /* CONFIG_CRASH_CORE */
 
+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+			     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
@ kernel/ksysfs.c:271 @ static struct attribute * kernel_attrs[] = {
 #ifndef CONFIG_TINY_RCU
 	&rcu_expedited_attr.attr,
 	&rcu_normal_attr.attr,
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	&realtime_attr.attr,
 #endif
 	NULL
 };
@ kernel/locking/rtmutex.c:221 @ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 	return try_cmpxchg_acquire(&lock->owner, &old, new);
 }
 
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@ kernel/locking/rtmutex.c:305 @ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 
 }
 
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	/*
+	 * With debug enabled rt_mutex_cmpxchg_acquire() will always fail,
+	 * which will unconditionally invoke sched_submit/resume_work() in
+	 * the slow path of __rt_mutex_lock() and __ww_rt_mutex_lock() even
+	 * in the non-contended case.
+	 *
+	 * Avoid that by using rt_mutex_slowtrylock() which is covered by
+	 * the debug code and can acquire a non-contended rtmutex. On
+	 * success the callsite avoids the sched_submit/resume_work()
+	 * dance.
+	 */
+	return rt_mutex_slowtrylock(lock);
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@ kernel/locking/rtmutex.c:1581 @ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			schedule_rtmutex();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@ kernel/locking/rtmutex.c:1610 @ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		schedule_rtmutex();
 	}
 }
 
@ kernel/locking/rtmutex.c:1705 @ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	unsigned long flags;
 	int ret;
 
+	/*
+	 * The task is about to sleep. Invoke sched_submit_work() before
+	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 */
+	sched_submit_work();
+
 	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
@ kernel/locking/rtmutex.c:1723 @ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+	sched_resume_work();
 	return ret;
 }
 
 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
 					   unsigned int state)
 {
-	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+	lockdep_assert(!current->pi_blocked_on);
+
+	if (likely(rt_mutex_try_acquire(lock)))
 		return 0;
 
 	return rt_mutex_slowlock(lock, NULL, state);
@ kernel/locking/rwbase_rt.c:75 @ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	int ret;
 
 	raw_spin_lock_irq(&rtm->wait_lock);
-	/*
-	 * Allow readers, as long as the writer has not completely
-	 * acquired the semaphore for write.
-	 */
-	if (atomic_read(&rwb->readers) != WRITER_BIAS) {
-		atomic_inc(&rwb->readers);
-		raw_spin_unlock_irq(&rtm->wait_lock);
-		return 0;
-	}
 
 	/*
 	 * Call into the slow lock path with the rtmutex->wait_lock
@ kernel/locking/rwbase_rt.c:134 @ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	int ret;
+
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
-	return __rwbase_read_lock(rwb, state);
+	/*
+	 * The task is about to sleep. For rwsems, rwbase_sched_submit_work()
+	 * invokes sched_submit_work() before blocking as that might take
+	 * locks and corrupt tsk::pi_blocked_on. It must be explicit here
+	 * because __rwbase_read_lock() cannot invoke rt_mutex_slowlock().
+	 * NOP for rwlocks.
+	 */
+	rwbase_sched_submit_work();
+	ret = __rwbase_read_lock(rwb, state);
+	rwbase_sched_resume_work();
+	return ret;
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@ kernel/locking/rwbase_rt.c:246 @ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	unsigned long flags;
 
-	/* Take the rtmutex as a first step */
+	/*
+	 * Take the rtmutex as a first step. For rwsem this will also
+	 * invoke sched_submit_work() to flush IO and workers.
+	 */
 	if (rwbase_rtmutex_lock_state(rtm, state))
 		return -EINTR;
 
@ kernel/locking/rwsem.c:1418 @ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
+#define rwbase_sched_submit_work()			\
+	sched_submit_work()
+
+#define rwbase_sched_resume_work()			\
+	sched_resume_work()
+
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
 	__rt_mutex_slowlock_locked(rtm, NULL, state)
 
@ kernel/locking/spinlock_rt.c:40 @
 
 static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
 		rtlock_slowlock(rtm);
 }
@ kernel/locking/spinlock_rt.c:164 @ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
 	return 0;
 }
 
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {
@ kernel/locking/ww_rt_mutex.c:65 @ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
 	}
 	mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
 
-	if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+	if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
 		if (ww_ctx)
 			ww_mutex_set_context_fastpath(lock, ww_ctx);
 		return 0;
@ kernel/panic.c:278 @ static void panic_other_cpus_shutdown(bool crash_kexec)
  */
 void panic(const char *fmt, ...)
 {
+	enum cons_prio prev_prio;
 	static char buf[1024];
 	va_list args;
 	long i, i_next = 0, len;
@ kernel/panic.c:326 @ void panic(const char *fmt, ...)
 	if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
 		panic_smp_self_stop();
 
+	prev_prio = cons_atomic_enter(CONS_PRIO_PANIC);
+
 	console_verbose();
 	bust_spinlocks(1);
 	va_start(args, fmt);
@ kernel/panic.c:388 @ void panic(const char *fmt, ...)
 	if (_crash_kexec_post_notifiers)
 		__crash_kexec(NULL);
 
+	cons_atomic_flush(NULL, true);
+
 	console_unblank();
 
 	/*
@ kernel/panic.c:414 @ void panic(const char *fmt, ...)
 		 * We can't use the "normal" timers since we just panicked.
 		 */
 		pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
+		cons_atomic_flush(NULL, true);
 
 		for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
 			touch_nmi_watchdog();
@ kernel/panic.c:433 @ void panic(const char *fmt, ...)
 		 */
 		if (panic_reboot_mode != REBOOT_UNDEFINED)
 			reboot_mode = panic_reboot_mode;
+		cons_atomic_flush(NULL, true);
 		emergency_restart();
 	}
 #ifdef __sparc__
@ kernel/panic.c:446 @ void panic(const char *fmt, ...)
 	}
 #endif
 #if defined(CONFIG_S390)
+	cons_atomic_flush(NULL, true);
 	disabled_wait();
 #endif
 	pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
 
 	/* Do not scroll important messages printed above */
 	suppress_printk = 1;
+
+	cons_atomic_exit(CONS_PRIO_PANIC, prev_prio);
+
 	local_irq_enable();
 	for (i = 0; ; i += PANIC_TIMER_STEP) {
 		touch_softlockup_watchdog();
@ kernel/panic.c:666 @ struct warn_args {
 void __warn(const char *file, int line, void *caller, unsigned taint,
 	    struct pt_regs *regs, struct warn_args *args)
 {
+	enum cons_prio prev_prio;
+
+	prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+
 	disable_trace_on_warning();
 
 	if (file)
@ kernel/panic.c:700 @ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 	/* Just a warning, don't kill lockdep. */
 	add_taint(taint, LOCKDEP_STILL_OK);
+
+	cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
 }
 
 #ifndef __WARN_FLAGS
@ kernel/printk/Makefile:3 @
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y	= printk.o
-obj-$(CONFIG_PRINTK)	+= printk_safe.o
+obj-$(CONFIG_PRINTK)	+= printk_safe.o printk_nobkl.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
 obj-$(CONFIG_PRINTK_INDEX)	+= index.o
 
@ kernel/printk/internal.h:6 @
  * internal.h - printk internal definitions
  */
 #include <linux/percpu.h>
+#include <linux/console.h>
+#include "printk_ringbuffer.h"
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
 void __init printk_sysctl_init(void);
@ kernel/printk/internal.h:17 @ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 #define printk_sysctl_init() do { } while (0)
 #endif
 
-#ifdef CONFIG_PRINTK
+#define con_printk(lvl, con, fmt, ...)				\
+	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
+	       (con->flags & CON_NO_BKL) ? "" : "legacy ",	\
+	       (con->flags & CON_BOOT) ? "boot" : "",		\
+	       con->name, con->index, ##__VA_ARGS__)
 
+#ifdef CONFIG_PRINTK
 #ifdef CONFIG_PRINTK_CALLER
 #define PRINTK_PREFIX_MAX	48
 #else
@ kernel/printk/internal.h:45 @ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
+extern struct printk_ringbuffer *prb;
+extern bool have_bkl_console;
+extern bool printk_threads_enabled;
+
+extern bool have_boot_console;
+
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
@ kernel/printk/internal.h:61 @ __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
 
 bool printk_percpu_data_ready(void);
 
+/*
+ * The printk_safe_enter()/_exit() macros mark code blocks using locks that
+ * would lead to deadlock if an interrupting context were to call printk()
+ * while the interrupted context was within such code blocks.
+ *
+ * When a CPU is in such a code block, an interrupting context calling
+ * printk() will only log the new message to the lockless ringbuffer and
+ * then trigger console printing using irqwork.
+ */
+
 #define printk_safe_enter_irqsave(flags)	\
 	do {					\
-		local_irq_save(flags);		\
-		__printk_safe_enter();		\
+		__printk_safe_enter(&flags);	\
 	} while (0)
 
 #define printk_safe_exit_irqrestore(flags)	\
 	do {					\
-		__printk_safe_exit();		\
-		local_irq_restore(flags);	\
+		__printk_safe_exit(&flags);	\
 	} while (0)
 
 void defer_console_output(void);
 
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
+
+u64 cons_read_seq(struct console *con);
+void cons_nobkl_cleanup(struct console *con);
+bool cons_nobkl_init(struct console *con);
+bool cons_alloc_percpu_data(struct console *con);
+void cons_kthread_create(struct console *con);
+void cons_wake_threads(void);
+void cons_force_seq(struct console *con, u64 seq);
+void console_bkl_kthread_create(void);
+
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records. If the caller only works with certain types of consoles, the
+ * caller is responsible for checking the console type before calling
+ * this function.
+ */
+static inline bool console_is_usable(struct console *con, short flags)
+{
+	if (!(flags & CON_ENABLED))
+		return false;
+
+	if (flags & CON_SUSPENDED)
+		return false;
+
+	/*
+	 * The usability of a console varies depending on whether
+	 * it is a NOBKL console or not.
+	 */
+
+	if (flags & CON_NO_BKL) {
+		if (have_boot_console)
+			return false;
+
+	} else {
+		if (!con->write)
+			return false;
+		/*
+		 * Console drivers may assume that per-cpu resources have
+		 * been allocated. So unless they're explicitly marked as
+		 * being able to cope (CON_ANYTIME) don't call them until
+		 * this CPU is officially up.
+		 */
+		if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * cons_kthread_wake - Wake up a printk thread
+ * @con:        Console to operate on
+ */
+static inline void cons_kthread_wake(struct console *con)
+{
+	rcuwait_wake_up(&con->rcuwait);
+}
+
 #else
 
 #define PRINTK_PREFIX_MAX	0
 #define PRINTK_MESSAGE_MAX	0
 #define PRINTKRB_RECORD_MAX	0
 
+static inline void cons_kthread_wake(struct console *con) { }
+static inline void cons_kthread_create(struct console *con) { }
+#define printk_threads_enabled	(false)
+
 /*
  * In !PRINTK builds we still export console_sem
  * semaphore and some of console functions (console_unlock()/etc.), so
@ kernel/printk/internal.h:162 @ u16 printk_parse_prefix(const char *text, int *level,
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline bool cons_nobkl_init(struct console *con) { return true; }
+static inline void cons_nobkl_cleanup(struct console *con) { }
+static inline bool console_is_usable(struct console *con, short flags) { return false; }
+static inline void cons_force_seq(struct console *con, u64 seq) { }
+
 #endif /* CONFIG_PRINTK */
 
+extern bool have_boot_console;
+
 /**
  * struct printk_buffers - Buffers to read/format/output printk messages.
  * @outbuf:	After formatting, contains text to output.
@ kernel/printk/internal.h:196 @ struct printk_message {
 	u64			seq;
 	unsigned long		dropped;
 };
+
+/**
+ * struct cons_context_data - console context data
+ * @wctxt:		Write context per priority level
+ * @pbufs:		Buffer for storing the text
+ *
+ * Used for early boot and for per CPU data.
+ *
+ * The write contexts are allocated to avoid having them on stack, e.g. in
+ * warn() or panic().
+ */
+struct cons_context_data {
+	struct cons_write_context	wctxt[CONS_PRIO_MAX];
+	struct printk_buffers		pbufs;
+};
+
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_suppress);
+
+#ifdef CONFIG_PRINTK
+
+void console_prepend_dropped(struct printk_message *pmsg,
+			     unsigned long dropped);
+
+#endif
@ kernel/printk/printk.c:445 @ static int console_msg_format = MSG_FORMAT_DEFAULT;
 /* syslog_lock protects syslog_* variables and write access to clear_seq. */
 static DEFINE_MUTEX(syslog_lock);
 
+/*
+ * Specifies if a BKL console was ever registered. Used to determine if the
+ * console lock/unlock dance is needed for console printing.
+ */
+bool have_bkl_console;
+
+/*
+ * Specifies if a boot console is registered. Used to determine if NOBKL
+ * consoles may be used since NOBKL consoles cannot synchronize with boot
+ * consoles.
+ */
+bool have_boot_console;
+
+static int unregister_console_locked(struct console *console);
+
 #ifdef CONFIG_PRINTK
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
@ kernel/printk/printk.c:510 @ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
 
 static struct printk_ringbuffer printk_rb_dynamic;
 
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
 
 /*
  * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@ kernel/printk/printk.c:714 @ static ssize_t msg_print_ext_body(char *buf, size_t size,
 	return len;
 }
 
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_supress);
-
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	atomic64_t seq;
@ kernel/printk/printk.c:1115 @ static inline void log_buf_add_cpu(void) {}
 
 static void __init set_percpu_data_ready(void)
 {
+	struct hlist_node *tmp;
+	struct console *con;
+
+	console_list_lock();
+
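+	/*
+	 * Allocate the per-CPU data for all registered consoles. A console
+	 * for which the allocation fails cannot be used and is unregistered.
+	 */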
+	hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+		if (!cons_alloc_percpu_data(con))
+			unregister_console_locked(con);
+	}
+
 	__printk_percpu_data_ready = true;
+
+	console_list_unlock();
 }
 
 static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
@ kernel/printk/printk.c:2303 @ asmlinkage int vprintk_emit(int facility, int level,
 			    const struct dev_printk_info *dev_info,
 			    const char *fmt, va_list args)
 {
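+	/* Local write context used to flush the NOBKL consoles below. */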
+	struct cons_write_context wctxt = { };
 	int printed_len;
 	bool in_sched = false;
 
@ kernel/printk/printk.c:2324 @ asmlinkage int vprintk_emit(int facility, int level,
 
 	printed_len = vprintk_store(facility, level, dev_info, fmt, args);
 
+	/*
+	 * The caller may be holding system-critical or
+	 * timing-sensitive locks. Disable preemption during
+	 * printing of all remaining records to all consoles so that
+	 * this context can return as soon as possible. Hopefully
+	 * another printk() caller will take over the printing.
+	 */
+	preempt_disable();
+
+	/*
+	 * Flush the non-BKL consoles. This only leads to direct atomic
+	 * printing for non-BKL consoles that do not have a printer
+	 * thread available. Otherwise the printer thread will perform
+	 * the printing.
+	 */
+	cons_atomic_flush(&wctxt, true);
+
 	/* If called from the scheduler, we can not call up(). */
-	if (!in_sched) {
-		/*
-		 * The caller may be holding system-critical or
-		 * timing-sensitive locks. Disable preemption during
-		 * printing of all remaining records to all consoles so that
-		 * this context can return as soon as possible. Hopefully
-		 * another printk() caller will take over the printing.
-		 */
-		preempt_disable();
+	if (!in_sched && have_bkl_console && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
 		/*
 		 * Try to acquire and then immediately release the console
 		 * semaphore. The release will print out buffers. With the
@ kernel/printk/printk.c:2351 @ asmlinkage int vprintk_emit(int facility, int level,
 		 */
 		if (console_trylock_spinning())
 			console_unlock();
-		preempt_enable();
 	}
 
-	wake_up_klogd();
+	preempt_enable();
+
+	cons_wake_threads();
+	if (in_sched)
+		defer_console_output();
+	else
+		wake_up_klogd();
 	return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
@ kernel/printk/printk.c:2598 @ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
  */
 void suspend_console(void)
 {
+	struct console *con;
+
 	if (!console_suspend_enabled)
 		return;
 	pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
 	pr_flush(1000, true);
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see that they are suspended so that it
+	 * is guaranteed that all printing has stopped when this function
+	 * completes.
+	 */
+	synchronize_srcu(&console_srcu);
+
 	console_lock();
 	console_suspended = 1;
 	up_console_sem();
@ kernel/printk/printk.c:2625 @ void suspend_console(void)
 
 void resume_console(void)
 {
+	struct console *con;
+	short flags;
+	int cookie;
+
 	if (!console_suspend_enabled)
 		return;
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see they are no longer suspended so
+	 * that they are guaranteed to wake up and resume printing.
+	 */
+	synchronize_srcu(&console_srcu);
+
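+	/* Kick the printer threads of the NOBKL consoles to resume printing. */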
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(con) {
+		flags = console_srcu_read_flags(con);
+		if (flags & CON_NO_BKL)
+			cons_kthread_wake(con);
+	}
+	console_srcu_read_unlock(cookie);
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && have_bkl_console)
+		wake_up_interruptible(&log_wait);
+
 	pr_flush(1000, true);
 }
 
@ kernel/printk/printk.c:2672 @ void resume_console(void)
  */
 static int console_cpu_notify(unsigned int cpu)
 {
-	if (!cpuhp_tasks_frozen) {
+	if (!cpuhp_tasks_frozen && have_bkl_console) {
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
@ kernel/printk/printk.c:2747 @ static bool abandon_console_lock_in_panic(void)
 	return atomic_read(&panic_cpu) != raw_smp_processor_id();
 }
 
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_srcu_read_lock.
- */
-static inline bool console_is_usable(struct console *con)
-{
-	short flags = console_srcu_read_flags(con);
-
-	if (!(flags & CON_ENABLED))
-		return false;
-
-	if (!con->write)
-		return false;
-
-	/*
-	 * Console drivers may assume that per-cpu resources have been
-	 * allocated. So unless they're explicitly marked as being able to
-	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
-	 */
-	if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
-		return false;
-
-	return true;
-}
-
 static void __console_unlock(void)
 {
 	console_locked = 0;
@ kernel/printk/printk.c:2768 @ static void __console_unlock(void)
  * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
  */
 #ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
 {
 	struct printk_buffers *pbufs = pmsg->pbufs;
 	const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@ kernel/printk/printk.c:2800 @ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
 	pmsg->outbuf_len += len;
 }
 #else
-#define console_prepend_dropped(pmsg, dropped)
+static inline void console_prepend_dropped(struct printk_message *pmsg,
+					   unsigned long dropped) { }
 #endif /* CONFIG_PRINTK */
 
 /*
@ kernel/printk/printk.c:2823 @ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
  * of @pmsg are valid. (See the documentation of struct printk_message
  * for information about the @pmsg fields.)
  */
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_suppress)
 {
 	static int panic_console_dropped;
 
@ kernel/printk/printk.c:2993 @ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(con) {
+			short flags = console_srcu_read_flags(con);
 			bool progress;
 
-			if (!console_is_usable(con))
+			/* console_flush_all() is only for legacy consoles. */
+			if (flags & CON_NO_BKL)
+				continue;
+
+			if (!console_is_usable(con, flags))
 				continue;
 			any_usable = true;
 
@ kernel/printk/printk.c:3038 @ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 	return false;
 }
 
-/**
- * console_unlock - unblock the console subsystem from printing
- *
- * Releases the console_lock which the caller holds to block printing of
- * the console subsystem.
- *
- * While the console_lock was held, console output may have been buffered
- * by printk().  If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * console_unlock(); may be called from any context.
- */
-void console_unlock(void)
+static u64 console_flush_and_unlock(void)
 {
 	bool do_cond_resched;
 	bool handover;
 	bool flushed;
 	u64 next_seq;
 
-	if (console_suspended) {
-		up_console_sem();
-		return;
-	}
-
 	/*
 	 * Console drivers are called with interrupts disabled, so
 	 * @console_may_schedule should be cleared before; however, we may
@ kernel/printk/printk.c:3081 @ void console_unlock(void)
 		 * fails, another context is already handling the printing.
 		 */
 	} while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
+
+	return next_seq;
+}
+
+/**
+ * console_unlock - unblock the console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk().  If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+	if (console_suspended) {
+		up_console_sem();
+		return;
+	}
+
+	/*
+	 * PREEMPT_RT relies on kthread and atomic consoles for printing.
+	 * It never attempts to print from console_unlock().
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		__console_unlock();
+		return;
+	}
+
+	console_flush_and_unlock();
 }
 EXPORT_SYMBOL(console_unlock);
 
@ kernel/printk/printk.c:3138 @ void console_unblank(void)
 	struct console *c;
 	int cookie;
 
+	if (!have_bkl_console)
+		return;
+
 	/*
 	 * Stop console printing because the unblank() callback may
 	 * assume the console is not within its write() callback.
@ kernel/printk/printk.c:3149 @ void console_unblank(void)
 	 * In that case, attempt a trylock as best-effort.
 	 */
 	if (oops_in_progress) {
+		/* Semaphores are not NMI-safe. */
+		if (in_nmi())
+			return;
+
 		if (down_trylock_console_sem() != 0)
 			return;
 	} else
@ kernel/printk/printk.c:3182 @ void console_unblank(void)
  */
 void console_flush_on_panic(enum con_flush_mode mode)
 {
+	struct console *c;
+	short flags;
+	int cookie;
+	u64 seq;
+
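+	/* For CONSOLE_REPLAY_ALL all consoles restart at the oldest record. */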
+	seq = prb_first_valid_seq(prb);
+
+	/*
+	 * Safely flush the atomic consoles before trying to flush any
+	 * BKL/legacy consoles.
+	 */
+	if (mode == CONSOLE_REPLAY_ALL) {
+		cookie = console_srcu_read_lock();
+		for_each_console_srcu(c) {
+			flags = console_srcu_read_flags(c);
+			if (flags & CON_NO_BKL)
+				cons_force_seq(c, seq);
+		}
+		console_srcu_read_unlock(cookie);
+	}
+	cons_atomic_flush(NULL, true);
+
+	if (!have_bkl_console)
+		return;
+
 	/*
 	 * If someone else is holding the console lock, trylock will fail
 	 * and may_schedule may be set.  Ignore and proceed to unlock so
 	 * that messages are flushed out.  As this can be called from any
 	 * context and we don't want to get preempted while flushing,
 	 * ensure may_schedule is cleared.
+	 *
+	 * Since semaphores are not NMI-safe, the console lock must be
+	 * ignored if the panic is in NMI context.
 	 */
-	console_trylock();
+	if (!in_nmi())
+		console_trylock();
 	console_may_schedule = 0;
 
 	if (mode == CONSOLE_REPLAY_ALL) {
-		struct console *c;
-		int cookie;
-		u64 seq;
-
-		seq = prb_first_valid_seq(prb);
-
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(c) {
 			/*
@ kernel/printk/printk.c:3233 @ void console_flush_on_panic(enum con_flush_mode mode)
 		}
 		console_srcu_read_unlock(cookie);
 	}
-	console_unlock();
+	if (!in_nmi())
+		console_unlock();
 }
 
 /*
@ kernel/printk/printk.c:3291 @ EXPORT_SYMBOL(console_stop);
 
 void console_start(struct console *console)
 {
+	short flags;
+
 	console_list_lock();
 	console_srcu_write_flags(console, console->flags | CON_ENABLED);
+	flags = console->flags;
 	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. The related
+	 * printing context must be able to see it is enabled so that
+	 * it is guaranteed to wake up and resume printing.
+	 */
+	synchronize_srcu(&console_srcu);
+
+	if (flags & CON_NO_BKL)
+		cons_kthread_wake(console);
+	else if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		wake_up_interruptible(&log_wait);
+
 	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
 
+static struct task_struct *console_bkl_kthread;
+
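+/*
+ * Check whether the BKL printing thread has work: either it was asked to
+ * stop or a usable legacy console has not caught up with the ringbuffer.
+ */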
+static bool printer_should_wake(u64 seq)
+{
+	bool available = false;
+	struct console *con;
+	int cookie;
+
+	if (kthread_should_stop())
+		return true;
+
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(con) {
+		short flags = console_srcu_read_flags(con);
+
+		if (flags & CON_NO_BKL)
+			continue;
+		if (!console_is_usable(con, flags))
+			continue;
+		/*
+		 * It is safe to read @con->seq here because only this
+		 * thread context updates it.
+		 */
+		if (prb_read_valid(prb, con->seq, NULL)) {
+			available = true;
+			break;
+		}
+	}
+	console_srcu_read_unlock(cookie);
+
+	return available;
+}
+
+static int console_bkl_kthread_func(void *unused)
+{
+	u64 seq = 0;
+	int error;
+
+	for (;;) {
+		error = wait_event_interruptible(log_wait, printer_should_wake(seq));
+
+		if (kthread_should_stop())
+			break;
+
+		if (error)
+			continue;
+
+		console_lock();
+		if (console_suspended)
+			up_console_sem();
+		else
+			seq = console_flush_and_unlock();
+	}
+	return 0;
+}
+
+void console_bkl_kthread_create(void)
+{
+	struct task_struct *kt;
+	struct console *c;
+
+	lockdep_assert_held(&console_mutex);
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		return;
+
+	if (!printk_threads_enabled || console_bkl_kthread)
+		return;
+
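+	/* The BKL thread is only started once all boot consoles are gone. */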
+	for_each_console(c) {
+		if (c->flags & CON_BOOT)
+			return;
+	}
+
+	kt = kthread_run(console_bkl_kthread_func, NULL, "pr/bkl");
+	if (IS_ERR(kt)) {
+		pr_err("unable to start BKL printing thread\n");
+		return;
+	}
+
+	console_bkl_kthread = kt;
+
+	/*
+	 * It is important that console printing threads are scheduled
+	 * shortly after a printk call and with generous runtime budgets.
+	 */
+	sched_set_normal(console_bkl_kthread, -20);
+}
+
 static int __read_mostly keep_bootcon;
 
 static int __init keep_bootcon_setup(char *str)
@ kernel/printk/printk.c:3486 @ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
-#define con_printk(lvl, con, fmt, ...)			\
-	printk(lvl pr_fmt("%sconsole [%s%d] " fmt),	\
-	       (con->flags & CON_BOOT) ? "boot" : "",	\
-	       con->name, con->index, ##__VA_ARGS__)
-
 static void console_init_seq(struct console *newcon, bool bootcon_registered)
 {
 	struct console *con;
@ kernel/printk/printk.c:3550 @ static void console_init_seq(struct console *newcon, bool bootcon_registered)
 #define console_first()				\
 	hlist_entry(console_list.first, struct console, node)
 
-static int unregister_console_locked(struct console *console);
-
 /*
  * The console driver calls this routine during kernel initialization
  * to register the console printing procedure with printk() and to
@ kernel/printk/printk.c:3641 @ void register_console(struct console *newcon)
 	newcon->dropped = 0;
 	console_init_seq(newcon, bootcon_registered);
 
+	if (!(newcon->flags & CON_NO_BKL)) {
+		have_bkl_console = true;
+		console_bkl_kthread_create();
+	} else if (!cons_nobkl_init(newcon)) {
+		goto unlock;
+	}
+
+	if (newcon->flags & CON_BOOT)
+		have_boot_console = true;
+
 	/*
 	 * Put this console in the list - keep the
 	 * preferred driver at the head of the list.
@ kernel/printk/printk.c:3694 @ void register_console(struct console *newcon)
 			if (con->flags & CON_BOOT)
 				unregister_console_locked(con);
 		}
+
+		/* All boot consoles have been unregistered. */
+		have_boot_console = false;
 	}
 unlock:
 	console_list_unlock();
@ kernel/printk/printk.c:3706 @ EXPORT_SYMBOL(register_console);
 /* Must be called under console_list_lock(). */
 static int unregister_console_locked(struct console *console)
 {
+	struct console *c;
+	bool is_boot_con;
 	int res;
 
 	lockdep_assert_console_list_lock_held();
 
-	con_printk(KERN_INFO, console, "disabled\n");
+	is_boot_con = console->flags & CON_BOOT;
 
 	res = _braille_unregister_console(console);
 	if (res < 0)
@ kernel/printk/printk.c:3720 @ static int unregister_console_locked(struct console *console)
 	if (res > 0)
 		return 0;
 
-	/* Disable it unconditionally */
-	console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
-
 	if (!console_is_registered_locked(console))
 		return -ENODEV;
 
+	console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+
+	con_printk(KERN_INFO, console, "disabled\n");
+
 	hlist_del_init_rcu(&console->node);
 
 	/*
@ kernel/printk/printk.c:3748 @ static int unregister_console_locked(struct console *console)
 	 */
 	synchronize_srcu(&console_srcu);
 
+	if (console->flags & CON_NO_BKL)
+		cons_nobkl_cleanup(console);
+
 	console_sysfs_notify();
 
 	if (console->exit)
 		res = console->exit(console);
 
+	/*
+	 * Each time a boot console unregisters, try to start up the printing
+	 * threads. They will only start if this was the last boot console.
+	 */
+	if (is_boot_con) {
+		for_each_console(c)
+			cons_kthread_create(c);
+	}
+
 	return res;
 }
 
@ kernel/printk/printk.c:3926 @ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 		/*
 		 * Hold the console_lock to guarantee safe access to
-		 * console->seq and to prevent changes to @console_suspended
-		 * until all consoles have been processed.
+		 * console->seq.
 		 */
 		console_lock();
 
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(c) {
+			short flags;
+
 			if (con && con != c)
 				continue;
-			if (!console_is_usable(c))
+
+			flags = console_srcu_read_flags(c);
+
+			if (!console_is_usable(c, flags))
 				continue;
+
+			/*
+			 * Since the console is locked, use this opportunity
+			 * to update console->seq for NOBKL consoles.
+			 */
+			if (flags & CON_NO_BKL)
+				c->seq = cons_read_seq(c);
+
 			printk_seq = c->seq;
 			if (printk_seq < seq)
 				diff += seq - printk_seq;
 		}
 		console_srcu_read_unlock(cookie);
 
-		/*
-		 * If consoles are suspended, it cannot be expected that they
-		 * make forward progress, so timeout immediately. @diff is
-		 * still used to return a valid flush status.
-		 */
-		if (console_suspended)
-			remaining = 0;
-		else if (diff != last_diff && reset_on_progress)
+		if (diff != last_diff && reset_on_progress)
 			remaining = timeout_ms;
 
 		console_unlock();
@ kernel/printk/printk.c:4013 @ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 	int pending = this_cpu_xchg(printk_pending, 0);
 
 	if (pending & PRINTK_PENDING_OUTPUT) {
-		/* If trylock fails, someone else is doing the printing */
-		if (console_trylock())
-			console_unlock();
+		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			/* The BKL thread waits on @log_wait. */
+			pending |= PRINTK_PENDING_WAKEUP;
+		} else {
+			/*
+			 * If trylock fails, some other context
+			 * will do the printing.
+			 */
+			if (console_trylock())
+				console_unlock();
+		}
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
@ kernel/printk/printk.c:4058 @ static void __wake_up_klogd(int val)
 	preempt_enable();
 }
 
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing for those records is handled elsewhere. In
+ * this case only the logging daemon needs to be woken.
+ *
+ * Context: Any context.
+ */
 void wake_up_klogd(void)
 {
 	__wake_up_klogd(PRINTK_PENDING_WAKEUP);
 }
 
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ *	console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * but the current context is unable to perform the console printing.
+ * This function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
 void defer_console_output(void)
 {
+	int val = PRINTK_PENDING_WAKEUP;
+
 	/*
 	 * New messages may have been added directly to the ringbuffer
 	 * using vprintk_store(), so wake any waiters as well.
 	 */
-	__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
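+	/* Triggering console output is only required for legacy (BKL) consoles. */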
+	if (have_bkl_console)
+		val |= PRINTK_PENDING_OUTPUT;
+	__wake_up_klogd(val);
 }
 
 void printk_trigger_flush(void)
 {
+	struct cons_write_context wctxt = { };
+
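+	/* Flush any pending records directly to the NOBKL consoles. */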
+	preempt_disable();
+	cons_atomic_flush(&wctxt, true);
+	preempt_enable();
+
+	cons_wake_threads();
 	defer_console_output();
 }
 
 int vprintk_deferred(const char *fmt, va_list args)
 {
-	int r;
-
-	r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
-	defer_console_output();
-
-	return r;
+	return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
 }
 
 int _printk_deferred(const char *fmt, ...)
@ kernel/printk/printk_nobkl.c:4 @
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include "printk_ringbuffer.h"
+#include "internal.h"
+/*
+ * Printk implementation for consoles that do not depend on the BKL style
+ * console_lock mechanism.
+ *
+ * Console is locked on a CPU when state::locked is set and state::cpu ==
+ * current CPU. This is valid for the current execution context.
+ *
+ * Nesting execution contexts on the same CPU can carefully take over
+ * if the driver allows reentrancy via state::unsafe = false. When the
+ * interrupted context resumes it checks the state before entering
+ * an unsafe region and aborts the operation if it detects a takeover.
+ *
+ * In case of panic or emergency the nesting context can take over the
+ * console forcefully. The write callback is then invoked with the unsafe
+ * flag set in the write context data, which allows the driver side to avoid
+ * locks and to evaluate the driver state so it can use an emergency path
+ * or repair the state instead of blindly assuming that it works.
+ *
+ * If the interrupted context touches the assigned record buffer after
+ * takeover, it does not cause harm because at the same execution level
+ * there is no concurrency on the same CPU. A threaded printer always has
+ * its own record buffer so it can never interfere with any of the per CPU
+ * record buffers.
+ *
+ * A concurrent writer on a different CPU can request to take over the
+ * console by:
+ *
+ *	1) Carefully writing the desired state into state[REQ]
+ *	   if there is no same or higher priority request pending.
+ *	   This locks state[REQ] except for higher priority
+ *	   waiters.
+ *
+ *	2) Setting state[CUR].req_prio unless a same or higher
+ *	   priority waiter won the race.
+ *
+ *	3) Carefully spin on state[CUR] until that is locked with the
+ *	   expected state. When the state is not the expected one then it
+ *	   has to verify that state[REQ] is still the same and that
+ *	   state[CUR] has not been taken over or unlocked.
+ *
+ *	The unlocker hands over to state[REQ], but only if state[CUR]
+ *	matches.
+ *
+ * In case the owner does not react to the request and does not make
+ * observable progress, the waiter will time out and can then decide to
+ * attempt a hostile takeover.
+ */
+
+#define copy_full_state(_dst, _src)	do { _dst = _src; } while (0)
+#define copy_bit_state(_dst, _src)	do { _dst.bits = _src.bits; } while (0)
+
+#ifdef CONFIG_64BIT
+#define copy_seq_state64(_dst, _src)	do { _dst.seq = _src.seq; } while (0)
+#else
+#define copy_seq_state64(_dst, _src)	do { } while (0)
+#endif
+
+enum state_selector {
+	CON_STATE_CUR,
+	CON_STATE_REQ,
+};
+
+/**
+ * cons_state_set - Helper function to set the console state
+ * @con:	Console to update
+ * @which:	Selects real state or handover state
+ * @new:	The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use cons_state_try_cmpxchg().
+ */
+static inline void cons_state_set(struct console *con, enum state_selector which,
+				  struct cons_state *new)
+{
+	atomic_long_set(&ACCESS_PRIVATE(con, atomic_state[which]), new->atom);
+}
+
+/**
+ * cons_state_read - Helper function to read the console state
+ * @con:	Console to update
+ * @which:	Selects real state or handover state
+ * @state:	The state to store the result
+ */
+static inline void cons_state_read(struct console *con, enum state_selector which,
+				   struct cons_state *state)
+{
+	state->atom = atomic_long_read(&ACCESS_PRIVATE(con, atomic_state[which]));
+}
+
+/**
+ * cons_state_try_cmpxchg() - Helper function for atomic_long_try_cmpxchg() on console state
+ * @con:	Console to update
+ * @which:	Selects real state or handover state
+ * @old:	Old/expected state
+ * @new:	New state
+ *
+ * Returns: True on success, false on fail
+ */
+static inline bool cons_state_try_cmpxchg(struct console *con,
+					  enum state_selector which,
+					  struct cons_state *old,
+					  struct cons_state *new)
+{
+	return atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_state[which]),
+				       &old->atom, new->atom);
+}
+
+/**
+ * cons_state_full_match - Check whether the full state matches
+ * @cur:	The state to check
+ * @prev:	The previous state
+ *
+ * Returns: True if matching, false otherwise.
+ *
+ * Check the full state including state::seq on 64bit. For take over
+ * detection.
+ */
+static inline bool cons_state_full_match(struct cons_state cur,
+					 struct cons_state prev)
+{
+	/*
+	 * req_prio can be set by a concurrent writer for friendly
+	 * handover. Ignore it in the comparison.
+	 */
+	cur.req_prio = prev.req_prio;
+	return cur.atom == prev.atom;
+}
+
+/**
+ * cons_state_bits_match - Check for matching state bits
+ * @cur:	The state to check
+ * @prev:	The previous state
+ *
+ * Returns: True if state matches, false otherwise.
+ *
+ * In contrast to cons_state_full_match() this checks only the bits and
+ * ignores a sequence change on 64bit. On 32bit the two functions are
+ * identical.
+ */
+static inline bool cons_state_bits_match(struct cons_state cur, struct cons_state prev)
+{
+	/*
+	 * req_prio can be set by a concurrent writer for friendly
+	 * handover. Ignore it in the comparison.
+	 */
+	cur.req_prio = prev.req_prio;
+	return cur.bits == prev.bits;
+}
+
+/**
+ * cons_check_panic - Check whether a remote CPU is in panic
+ *
+ * Returns: True if a remote CPU is in panic, false otherwise.
+ */
+static inline bool cons_check_panic(void)
+{
+	unsigned int pcpu = atomic_read(&panic_cpu);
+
+	return pcpu != PANIC_CPU_INVALID && pcpu != smp_processor_id();
+}
+
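+/* Fallback context for early boot, before per-CPU data is available. */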
+static struct cons_context_data early_cons_ctxt_data __initdata;
+
+/**
+ * cons_context_set_pbufs - Set the output text buffer for the current context
+ * @ctxt:	Pointer to the acquire context
+ *
+ * Buffer selection:
+ *   1) Early boot uses the global (initdata) buffer
+ *   2) Printer threads use the dynamically allocated per-console buffers
+ *   3) All other contexts use the per CPU buffers
+ *
+ * This guarantees that there is no concurrency on the output records ever.
+ * Early boot and per CPU nesting is not a problem. The takeover logic
+ * tells the interrupted context that the buffer has been overwritten.
+ *
+ * There are two critical regions that matter:
+ *
+ * 1) Context is filling the buffer with a record. After interruption
+ *    it continues to sprintf() the record and before it goes to
+ *    write it out, it checks the state, notices the takeover, discards
+ *    the content and backs out.
+ *
+ * 2) Context is in an unsafe critical region in the driver. After
+ *    interruption it might read overwritten data from the output
+ *    buffer. When it leaves the critical region it notices and backs
+ *    out. Hostile takeovers in driver critical regions are best effort
+ *    and there is not much that can be done about that.
+ */
+static __ref void cons_context_set_pbufs(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+
+	/* Thread context or early boot? */
+	if (ctxt->thread)
+		ctxt->pbufs = con->thread_pbufs;
+	else if (!con->pcpu_data)
+		ctxt->pbufs = &early_cons_ctxt_data.pbufs;
+	else
+		ctxt->pbufs = &(this_cpu_ptr(con->pcpu_data)->pbufs);
+}
+
+/**
+ * cons_seq_init - Helper function to initialize the console sequence
+ * @con:	Console to work on
+ *
+ * Set @con->atomic_seq to the starting record, or if that record no
+ * longer exists, the oldest available record. For init only. Do not
+ * use for runtime updates.
+ */
+static void cons_seq_init(struct console *con)
+{
+	u32 seq = (u32)max_t(u64, con->seq, prb_first_valid_seq(prb));
+#ifdef CONFIG_64BIT
+	struct cons_state state;
+
+	cons_state_read(con, CON_STATE_CUR, &state);
+	state.seq = seq;
+	cons_state_set(con, CON_STATE_CUR, &state);
+#else
+	atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq);
+#endif
+}
+
+/**
+ * cons_force_seq - Force a specified sequence number for a console
+ * @con:	Console to work on
+ * @seq:	Sequence number to force
+ *
+ * This function is only intended to be used in emergency situations. In
+ * particular: console_flush_on_panic(CONSOLE_REPLAY_ALL)
+ */
+void cons_force_seq(struct console *con, u64 seq)
+{
+#ifdef CONFIG_64BIT
+	struct cons_state old;
+	struct cons_state new;
+
+	do {
+		cons_state_read(con, CON_STATE_CUR, &old);
+		copy_bit_state(new, old);
+		new.seq = seq;
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new));
+#else
+	atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq);
+#endif
+}
+
+static inline u64 cons_expand_seq(u64 seq)
+{
+	u64 rbseq;
+
+	/*
+	 * The provided sequence is only the lower 32bits of the ringbuffer
+	 * sequence. It needs to be expanded to 64bit. Get the next sequence
+	 * number from the ringbuffer and fold it.
+	 */
+	rbseq = prb_next_seq(prb);
+	seq = rbseq - ((u32)rbseq - (u32)seq);
+
+	return seq;
+}
+
+/**
+ * cons_read_seq - Read the current console sequence
+ * @con:	Console to read the sequence of
+ *
+ * Returns:	Sequence number of the next record to print on @con.
+ */
+u64 cons_read_seq(struct console *con)
+{
+	u64 seq;
+#ifdef CONFIG_64BIT
+	struct cons_state state;
+
+	cons_state_read(con, CON_STATE_CUR, &state);
+	seq = state.seq;
+#else
+	seq = atomic_read(&ACCESS_PRIVATE(con, atomic_seq));
+#endif
+	return cons_expand_seq(seq);
+}
+
+/**
+ * cons_context_set_seq - Setup the context with the next sequence to print
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ *
+ * On return the retrieved sequence number is stored in ctxt->oldseq.
+ *
+ * The sequence number is safe in forceful takeover situations.
+ *
+ * Either the writer succeeded in updating it before it got interrupted,
+ * or it failed. In the latter case the takeover will print the
+ * same line again.
+ *
+ * The sequence is only the lower 32bits of the ringbuffer sequence. The
+ * ringbuffer must be 2^31 records ahead to get out of sync. This needs
+ * some care when starting a console, i.e. setting the sequence to 0 is
+ * wrong. It has to be set to the oldest valid sequence in the ringbuffer
+ * as that cannot be more than 2^31 records away.
+ *
+ * On 64bit the 32bit sequence is part of console::state, which is saved
+ * in @ctxt->state. This prevents the 32bit update race.
+ */
+static void cons_context_set_seq(struct cons_context *ctxt)
+{
+#ifdef CONFIG_64BIT
+	ctxt->oldseq = ctxt->state.seq;
+#else
+	ctxt->oldseq = atomic_read(&ACCESS_PRIVATE(ctxt->console, atomic_seq));
+#endif
+	ctxt->oldseq = cons_expand_seq(ctxt->oldseq);
+	ctxt->newseq = ctxt->oldseq;
+}
+
+/**
+ * cons_seq_try_update - Try to update the console sequence number
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ *
+ * Returns:	True if the console sequence was updated, false otherwise.
+ *
+ * Internal helper as the logic is different on 32bit and 64bit.
+ *
+ * On 32 bit the sequence is separate from state and therefore
+ * subject to a subtle race in the case of hostile takeovers.
+ *
+ * On 64 bit the sequence is part of the state and therefore safe
+ * vs. hostile takeovers.
+ *
+ * In case of fail the console has been taken over and @ctxt is
+ * invalid. Caller has to reacquire the console.
+ */
+#ifdef CONFIG_64BIT
+static bool cons_seq_try_update(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct cons_state old;
+	struct cons_state new;
+
+	cons_state_read(con, CON_STATE_CUR, &old);
+	do {
+		/* Make sure this context is still the owner. */
+		if (!cons_state_bits_match(old, ctxt->state))
+			return false;
+
+		/* Preserve bit state */
+		copy_bit_state(new, old);
+		new.seq = ctxt->newseq;
+
+		/*
+		 * Can race with hostile takeover or with a handover
+		 * request.
+		 */
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new));
+
+	copy_full_state(ctxt->state, new);
+	ctxt->oldseq = ctxt->newseq;
+
+	return true;
+}
+#else
+static bool cons_release(struct cons_context *ctxt);
+static bool cons_seq_try_update(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct cons_state state;
+	int pcpu;
+	u32 old;
+	u32 new;
+
+	/*
+	 * There is a corner case that needs to be considered here:
+	 *
+	 * CPU0			CPU1
+	 * printk()
+	 *  acquire()		-> emergency
+	 *  write()		   acquire()
+	 *  update_seq()
+	 *    state == OK
+	 * --> NMI
+	 *			   takeover()
+	 * <---			     write()
+	 *  cmpxchg() succeeds	     update_seq()
+	 *			     cmpxchg() fails
+	 *
+	 * There is nothing that can be done about this other than having
+	 * yet another state bit that needs to be tracked and analyzed,
+	 * but fails to cover the problem completely.
+	 *
+	 * No other scenarios expose such a problem. On same CPU takeovers
+	 * the cmpxchg() always fails on the interrupted context after the
+	 * interrupting context finished printing, but that's fine as it
+	 * does not own the console anymore. The state check after the
+	 * failed cmpxchg prevents that.
+	 */
+	cons_state_read(con, CON_STATE_CUR, &state);
+	/* Make sure this context is still the owner. */
+	if (!cons_state_bits_match(state, ctxt->state))
+		return false;
+
+	/*
+	 * Get the original sequence number that was retrieved
+	 * from @con->atomic_seq, which should still be the same.
+	 * 32bit truncates. See cons_context_set_seq().
+	 */
+	old = (u32)ctxt->oldseq;
+	new = (u32)ctxt->newseq;
+	if (atomic_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_seq), &old, new)) {
+		ctxt->oldseq = ctxt->newseq;
+		return true;
+	}
+
+	/*
+	 * Reread the state. If this context does not own the console anymore
+	 * then it cannot touch the sequence again.
+	 */
+	cons_state_read(con, CON_STATE_CUR, &state);
+	if (!cons_state_bits_match(state, ctxt->state))
+		return false;
+
+	pcpu = atomic_read(&panic_cpu);
+	if (pcpu == smp_processor_id()) {
+		/*
+		 * This is the panic CPU. Emitting a warning here does not
+		 * help at all. The callchain is clear and the priority is
+		 * to get the messages out. In the worst case duplicated
+		 * ones. That's a job for postprocessing.
+		 */
+		atomic_set(&ACCESS_PRIVATE(con, atomic_seq), new);
+		ctxt->oldseq = ctxt->newseq;
+		return true;
+	}
+
+	/*
+	 * Only emit a warning when this happens outside of a panic
+	 * situation. During panic a warning is neither useful nor does it
+	 * help the panic CPU to get the important stuff out.
+	 */
+	WARN_ON_ONCE(pcpu == PANIC_CPU_INVALID);
+
+	cons_release(ctxt);
+	return false;
+}
+#endif
+
+/**
+ * cons_cleanup_handover - Cleanup a handover request
+ * @ctxt:	Pointer to acquire context
+ *
+ * @ctxt->hov_state contains the state to clean up
+ */
+static void cons_cleanup_handover(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct cons_state new;
+
+	/*
+	 * No loop required. Either hov_state is still the same or
+	 * not.
+	 */
+	new.atom = 0;
+	cons_state_try_cmpxchg(con, CON_STATE_REQ, &ctxt->hov_state, &new);
+}
+
+/**
+ * cons_setup_handover - Setup a handover request
+ * @ctxt:	Pointer to acquire context
+ *
+ * Returns: True if a handover request was setup, false otherwise.
+ *
+ * On success @ctxt->hov_state contains the requested handover state
+ *
+ * On failure this context is not allowed to request a handover from the
+ * current owner. Reasons would be priority too low or a remote CPU in panic.
+ * In both cases this context should give up trying to acquire the console.
+ */
+static bool cons_setup_handover(struct cons_context *ctxt)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct cons_state old;
+	struct cons_state hstate = {
+		.locked		= 1,
+		.cur_prio	= ctxt->prio,
+		.cpu		= cpu,
+	};
+
+	/*
+	 * Try to store hstate in @con->atomic_state[REQ]. This might
+	 * race with a higher priority waiter.
+	 */
+	cons_state_read(con, CON_STATE_REQ, &old);
+	do {
+		if (cons_check_panic())
+			return false;
+
+		/* Same or higher priority waiter exists? */
+		if (old.cur_prio >= ctxt->prio)
+			return false;
+
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_REQ, &old, &hstate));
+
+	/* Save that state for comparison in spinwait */
+	copy_full_state(ctxt->hov_state, hstate);
+	return true;
+}
+
+/**
+ * cons_setup_request - Setup a handover request in state[CUR]
+ * @ctxt:	Pointer to acquire context
+ * @old:	The state that was used to make the decision to spin wait
+ *
+ * Returns: True if a handover request was setup in state[CUR], false
+ * otherwise.
+ *
+ * On success @ctxt->req_state contains the request state that was set in
+ * state[CUR]
+ *
+ * On failure this context encountered unexpected state values. This
+ * context should retry the full handover request setup process (the
+ * handover request setup by cons_setup_handover() is now invalidated
+ * and must be performed again).
+ */
+static bool cons_setup_request(struct cons_context *ctxt, struct cons_state old)
+{
+	struct console *con = ctxt->console;
+	struct cons_state cur;
+	struct cons_state new;
+
+	/* Now set the request in state[CUR] */
+	cons_state_read(con, CON_STATE_CUR, &cur);
+	do {
+		if (cons_check_panic())
+			goto cleanup;
+
+		/* Bit state changed vs. the decision to spinwait? */
+		if (!cons_state_bits_match(cur, old))
+			goto cleanup;
+
+		/*
+		 * A higher or equal priority context already setup a
+		 * request?
+		 */
+		if (cur.req_prio >= ctxt->prio)
+			goto cleanup;
+
+		/* Setup a request for handover. */
+		copy_full_state(new, cur);
+		new.req_prio = ctxt->prio;
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new));
+
+	/* Save that state for comparison in spinwait */
+	copy_bit_state(ctxt->req_state, new);
+	return true;
+
+cleanup:
+	cons_cleanup_handover(ctxt);
+	return false;
+}
+
+/**
+ * cons_try_acquire_spin - Complete the spinwait attempt
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ *
+ * @ctxt->hov_state contains the handover state that was set in
+ * state[REQ]
+ * @ctxt->req_state contains the request state that was set in
+ * state[CUR]
+ *
+ * Returns: 0 if successfully locked. -EBUSY on timeout. -EAGAIN on
+ * unexpected state values.
+ *
+ * On success @ctxt->state contains the new state that was set in
+ * state[CUR]
+ *
+ * On -EBUSY failure this context timed out. This context should either
+ * give up or attempt a hostile takeover.
+ *
+ * On -EAGAIN failure this context encountered unexpected state values.
+ * This context should retry the full handover request setup process (the
+ * handover request setup by cons_setup_handover() is now invalidated and
+ * must be performed again).
+ */
+static int cons_try_acquire_spin(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct cons_state cur;
+	struct cons_state new;
+	int err = -EAGAIN;
+	int timeout;
+
+	/* Now wait for the other side to hand over */
+	for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+		/* Timeout immediately if a remote panic is detected. */
+		if (cons_check_panic())
+			break;
+
+		cons_state_read(con, CON_STATE_CUR, &cur);
+
+		/*
+		 * If the real state of the console matches the handover state
+		 * that this context setup, then the handover was a success
+		 * and this context is now the owner.
+		 *
+		 * Note that this might have raced with a new higher priority
+		 * requester coming in after the lock was handed over.
+		 * However, that requester will see that the owner changes and
+		 * setup a new request for the current owner (this context).
+		 */
+		if (cons_state_bits_match(cur, ctxt->hov_state))
+			goto success;
+
+		/*
+		 * If state changed since the request was made, give up as
+		 * it is no longer consistent. This must include
+		 * state::req_prio since there could be a higher priority
+		 * request available.
+		 */
+		if (cur.bits != ctxt->req_state.bits)
+			goto cleanup;
+
+		/*
+		 * Finally check whether the handover state is still
+		 * the same.
+		 */
+		cons_state_read(con, CON_STATE_REQ, &cur);
+		if (cur.atom != ctxt->hov_state.atom)
+			goto cleanup;
+
+		/* Account time */
+		if (timeout > 0)
+			udelay(1);
+	}
+
+	/*
+	 * Timeout. Cleanup the handover state and carefully try to reset
+	 * req_prio in the real state. The reset is important to ensure
+	 * that the owner does not hand over the lock after this context
+	 * has given up waiting.
+	 */
+	cons_cleanup_handover(ctxt);
+
+	cons_state_read(con, CON_STATE_CUR, &cur);
+	do {
+		/*
+		 * The timeout might have raced with the owner coming late
+		 * and handing it over gracefully.
+		 */
+		if (cons_state_bits_match(cur, ctxt->hov_state))
+			goto success;
+
+		/*
+		 * Validate that the state matches with the state at request
+		 * time. If this check fails, there is already a higher
+		 * priority context waiting or the owner has changed (either
+		 * by higher priority or by hostile takeover). In all fail
+		 * cases this context is no longer in line for a handover to
+		 * take place, so no reset is necessary.
+		 */
+		if (cur.bits != ctxt->req_state.bits)
+			goto cleanup;
+
+		copy_full_state(new, cur);
+		new.req_prio = 0;
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new));
+	/* Reset worked. Report timeout. */
+	return -EBUSY;
+
+success:
+	/* Store the real state */
+	copy_full_state(ctxt->state, cur);
+	ctxt->hostile = false;
+	err = 0;
+
+cleanup:
+	cons_cleanup_handover(ctxt);
+	return err;
+}
+
+/**
+ * __cons_try_acquire - Try to acquire the console for printk output
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ *
+ * Returns: True if the acquire was successful. False on fail.
+ *
+ * In case of success @ctxt->state contains the acquisition
+ * state.
+ *
+ * In case of fail @ctxt->old_state contains the state
+ * that was read from @con->state for analysis by the caller.
+ */
+static bool __cons_try_acquire(struct cons_context *ctxt)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	short flags = console_srcu_read_flags(con);
+	struct cons_state old;
+	struct cons_state new;
+	int err;
+
+	if (WARN_ON_ONCE(!(flags & CON_NO_BKL)))
+		return false;
+again:
+	cons_state_read(con, CON_STATE_CUR, &old);
+
+	/* Preserve it for the caller and for spinwait */
+	copy_full_state(ctxt->old_state, old);
+
+	if (cons_check_panic())
+		return false;
+
+	/* Set up the new state for takeover */
+	copy_full_state(new, old);
+	new.locked = 1;
+	new.thread = ctxt->thread;
+	new.cur_prio = ctxt->prio;
+	new.req_prio = CONS_PRIO_NONE;
+	new.cpu = cpu;
+
+	/* Attempt to acquire it directly if unlocked */
+	if (!old.locked) {
+		if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+			goto again;
+
+		ctxt->hostile = false;
+		copy_full_state(ctxt->state, new);
+		goto success;
+	}
+
+	/*
+	 * A threaded printer context will never spin or perform a
+	 * hostile takeover. The atomic writer will wake the thread
+	 * when it is done with the important output.
+	 */
+	if (ctxt->thread)
+		return false;
+
+	/*
+	 * If the active context is on the same CPU then there is
+	 * obviously no handshake possible.
+	 */
+	if (old.cpu == cpu)
+		goto check_hostile;
+
+	/*
+	 * If a handover request with the same or higher priority is already
+	 * pending then this context cannot set up a handover request.
+	 */
+	if (old.req_prio >= ctxt->prio)
+		goto check_hostile;
+
+	/*
+	 * If the caller did not request spin-waiting then performing a
+	 * handover is not an option.
+	 */
+	if (!ctxt->spinwait)
+		goto check_hostile;
+
+	/*
+	 * Set up the request in state[REQ]. If this fails then this
+	 * context is not allowed to set up a handover request.
+	 */
+	if (!cons_setup_handover(ctxt))
+		goto check_hostile;
+
+	/*
+	 * Set up the request in state[CUR]. Hand in the state that was
+	 * used to make the decision to spinwait above, for comparison. If
+	 * this fails then unexpected state values were encountered and the
+	 * full request setup process is retried.
+	 */
+	if (!cons_setup_request(ctxt, old))
+		goto again;
+
+	/*
+	 * Spin-wait to acquire the console. If this fails then unexpected
+	 * state values were encountered (for example, a hostile takeover by
+	 * another context) and the full request setup process is retried.
+	 */
+	err = cons_try_acquire_spin(ctxt);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		goto check_hostile;
+	}
+success:
+	/* Common updates on success */
+	cons_context_set_seq(ctxt);
+	cons_context_set_pbufs(ctxt);
+	return true;
+
+check_hostile:
+	if (!ctxt->hostile)
+		return false;
+
+	if (cons_check_panic())
+		return false;
+
+	if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+		goto again;
+
+	copy_full_state(ctxt->state, new);
+	goto success;
+}
+
+/**
+ * cons_try_acquire - Try to acquire the console for printk output
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ *
+ * Returns: True if the acquire was successful. False on failure.
+ *
+ * In case of success @ctxt->state contains the acquisition
+ * state.
+ *
+ * In case of failure @ctxt->old_state contains the state
+ * that was read from @con->state for analysis by the caller.
+ */
+static bool cons_try_acquire(struct cons_context *ctxt)
+{
+	if (__cons_try_acquire(ctxt))
+		return true;
+
+	ctxt->state.atom = 0;
+	return false;
+}
+
+/**
+ * __cons_release - Release the console after output is done
+ * @ctxt:	The acquire context that contains the state
+ *		at cons_try_acquire()
+ *
+ * Returns:	True if the release was regular
+ *
+ *		False if the console is in an unusable state or was handed over
+ *		with handshake or taken over hostilely without handshake.
+ *
+ * The return value tells the caller whether it needs to evaluate further
+ * printing.
+ */
+static bool __cons_release(struct cons_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	short flags = console_srcu_read_flags(con);
+	struct cons_state hstate;
+	struct cons_state old;
+	struct cons_state new;
+
+	if (WARN_ON_ONCE(!(flags & CON_NO_BKL)))
+		return false;
+
+	cons_state_read(con, CON_STATE_CUR, &old);
+again:
+	if (!cons_state_bits_match(old, ctxt->state))
+		return false;
+
+	/* Release it directly when no handover request is pending. */
+	if (!old.req_prio)
+		goto unlock;
+
+	/* Read the handover target state */
+	cons_state_read(con, CON_STATE_REQ, &hstate);
+
+	/* If the waiter gave up, hstate is 0 */
+	if (!hstate.atom)
+		goto unlock;
+
+	/*
+	 * If a higher priority waiter raced against a lower priority
+	 * waiter then unlock instead of handing over to either. The
+	 * higher priority waiter will notice the updated state and
+	 * retry.
+	 */
+	if (hstate.cur_prio != old.req_prio)
+		goto unlock;
+
+	/* Switch the state and preserve the sequence on 64bit */
+	copy_bit_state(new, hstate);
+	copy_seq_state64(new, old);
+	if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+		goto again;
+
+	return true;
+
+unlock:
+	/* Clear the state and preserve the sequence on 64bit */
+	new.atom = 0;
+	copy_seq_state64(new, old);
+	if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+		goto again;
+
+	return true;
+}
+
+bool printk_threads_enabled __ro_after_init;
+static bool printk_force_atomic __initdata;
+
+/**
+ * cons_release - Release the console after output is done
+ * @ctxt:	The acquire context that contains the state
+ *		at cons_try_acquire()
+ *
+ * Returns:	True if the release was regular
+ *
+ *		False if the console is in an unusable state or was handed over
+ *		with handshake or taken over hostilely without handshake.
+ *
+ * The return value tells the caller whether it needs to evaluate further
+ * printing.
+ */
+static bool cons_release(struct cons_context *ctxt)
+{
+	bool ret = __cons_release(ctxt);
+
+	/* Invalidate the buffer pointer. It is no longer valid. */
+	ctxt->pbufs = NULL;
+
+	ctxt->state.atom = 0;
+	return ret;
+}
+
+bool console_try_acquire(struct cons_write_context *wctxt)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return cons_try_acquire(ctxt);
+}
+EXPORT_SYMBOL_GPL(console_try_acquire);
+
+bool console_release(struct cons_write_context *wctxt)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return cons_release(ctxt);
+}
+EXPORT_SYMBOL_GPL(console_release);
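+
+/*
+ * Illustrative usage sketch (not part of this patch): a caller prepares the
+ * private context of a struct cons_write_context and brackets its output
+ * with the acquire/release pair. @con and the emit step are placeholders:
+ *
+ *	struct cons_write_context wctxt = { };
+ *	struct cons_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ *
+ *	ctxt->console	= con;
+ *	ctxt->prio	= CONS_PRIO_NORMAL;
+ *
+ *	if (console_try_acquire(&wctxt)) {
+ *		... emit output, checking console_can_proceed() at safe points ...
+ *		console_release(&wctxt);
+ *	}
+ */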
+
+/**
+ * cons_alloc_percpu_data - Allocate percpu data for a console
+ * @con:	Console to allocate for
+ *
+ * Returns: True on success. False otherwise and the console cannot be used.
+ *
+ * If it is not yet possible to allocate per CPU data, success is returned.
+ * When per CPU data becomes possible, set_percpu_data_ready() will call
+ * this function again for all registered consoles.
+ */
+bool cons_alloc_percpu_data(struct console *con)
+{
+	if (!printk_percpu_data_ready())
+		return true;
+
+	con->pcpu_data = alloc_percpu(typeof(*con->pcpu_data));
+	if (con->pcpu_data)
+		return true;
+
+	con_printk(KERN_WARNING, con, "failed to allocate percpu buffers\n");
+	return false;
+}
+
+/**
+ * cons_free_percpu_data - Free percpu data of a console on unregister
+ * @con:	Console to clean up
+ */
+static void cons_free_percpu_data(struct console *con)
+{
+	if (!con->pcpu_data)
+		return;
+
+	free_percpu(con->pcpu_data);
+	con->pcpu_data = NULL;
+}
+
+/**
+ * console_can_proceed - Check whether printing can proceed
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Returns:	True if the state is correct. False if a handover
+ *		has been requested or if the console was taken
+ *		over.
+ *
+ * Must be invoked after the record was dumped into the assigned record
+ * buffer and at appropriate safe places in the driver.  For unsafe driver
+ * sections see console_enter_unsafe().
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_can_proceed(struct cons_write_context *wctxt)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	struct cons_state state;
+
+	cons_state_read(con, CON_STATE_CUR, &state);
+	/* Store it for analysis or reuse */
+	copy_full_state(ctxt->old_state, state);
+
+	/* Make sure this context is still the owner. */
+	if (!cons_state_full_match(state, ctxt->state))
+		return false;
+
+	/*
+	 * Having a safe point for takeover, and possibly a few
+	 * duplicated characters or a full line, is way better than a
+	 * hostile takeover. Post processing can take care of the garbage.
+	 * Continue if the requested priority is not sufficient.
+	 */
+	if (state.req_prio <= state.cur_prio)
+		return true;
+
+	/*
+	 * A console printer within an unsafe region is allowed to continue.
+	 * It can perform the handover when exiting the safe region. Otherwise
+	 * a hostile takeover will be necessary.
+	 */
+	if (state.unsafe)
+		return true;
+
+	/* Release and hand over */
+	cons_release(ctxt);
+	/*
+	 * This does not check whether the handover succeeded. The
+	 * outermost callsite has to make the final decision whether printing
+	 * should continue or not (via reacquire, possibly hostile). The
+	 * console is unlocked already so go back all the way instead of
+	 * trying to implement heuristics in tons of places.
+	 */
+	return false;
+}
+EXPORT_SYMBOL_GPL(console_can_proceed);
+
+/**
+ * __console_update_unsafe - Update the unsafe bit in @con->atomic_state
+ * @wctxt:	The write context that was handed to the write function
+ * @unsafe:	New value for the unsafe bit in the console state
+ *
+ * Returns:	True if the state is correct. False if a handover
+ *		has been requested or if the console was taken
+ *		over.
+ *
+ * Must be invoked before an unsafe driver section is entered.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ *
+ * Internal helper to avoid duplicated code
+ */
+static bool __console_update_unsafe(struct cons_write_context *wctxt, bool unsafe)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	struct cons_state new;
+
+	do  {
+		if (!console_can_proceed(wctxt))
+			return false;
+		/*
+		 * console_can_proceed() saved the real state in
+		 * ctxt->old_state
+		 */
+		copy_full_state(new, ctxt->old_state);
+		new.unsafe = unsafe;
+
+	} while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &ctxt->old_state, &new));
+
+	copy_full_state(ctxt->state, new);
+	return true;
+}
+
+/**
+ * console_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Returns:	True if the state is correct. False if a handover
+ *		has been requested or if the console was taken
+ *		over.
+ *
+ * Must be invoked before an unsafe driver section is entered.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_enter_unsafe(struct cons_write_context *wctxt)
+{
+	return __console_update_unsafe(wctxt, true);
+}
+EXPORT_SYMBOL_GPL(console_enter_unsafe);
+
+/**
+ * console_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Returns:	True if the state is correct. False if a handover
+ *		has been requested or if the console was taken
+ *		over.
+ *
+ * Must be invoked before an unsafe driver section is exited.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_exit_unsafe(struct cons_write_context *wctxt)
+{
+	return __console_update_unsafe(wctxt, false);
+}
+EXPORT_SYMBOL_GPL(console_exit_unsafe);
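+
+/*
+ * Illustrative usage sketch (not part of this patch): a NOBKL console
+ * driver's write_atomic() callback brackets hardware accesses that must
+ * not be interrupted by a takeover. The driver name and the hardware
+ * helper below are hypothetical:
+ *
+ *	static bool foo_write_atomic(struct console *con,
+ *				     struct cons_write_context *wctxt)
+ *	{
+ *		if (!console_enter_unsafe(wctxt))
+ *			return false;
+ *		foo_hw_emit(con, wctxt->outbuf, wctxt->len);
+ *		if (!console_exit_unsafe(wctxt))
+ *			return false;
+ *		return true;
+ *	}
+ *
+ * When either call returns false, the console was handed over or taken
+ * over and the driver must back out without touching the context again.
+ */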
+
+/**
+ * cons_get_record - Fill the buffer with the next pending ringbuffer record
+ * @wctxt:	The write context which will be handed to the write function
+ *
+ * Returns:	True if there are records available. If the next record should
+ *		be printed, the output buffer is filled and @wctxt->outbuf
+ *		points to the text to print. If @wctxt->outbuf is NULL after
+ *		the call, the record should not be printed but the caller must
+ *		still update the console sequence number.
+ *
+ *		False means that there are no pending records anymore and the
+ *		printing can stop.
+ */
+static bool cons_get_record(struct cons_write_context *wctxt)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+	struct printk_message pmsg = {
+		.pbufs = ctxt->pbufs,
+	};
+
+	if (!printk_get_next_message(&pmsg, ctxt->newseq, is_extended, true))
+		return false;
+
+	ctxt->newseq = pmsg.seq;
+	ctxt->dropped += pmsg.dropped;
+
+	if (pmsg.outbuf_len == 0) {
+		wctxt->outbuf = NULL;
+	} else {
+		if (ctxt->dropped && !is_extended)
+			console_prepend_dropped(&pmsg, ctxt->dropped);
+		wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+	}
+
+	wctxt->len = pmsg.outbuf_len;
+
+	return true;
+}
+
+/**
+ * cons_emit_record - Emit record in the acquired context
+ * @wctxt:	The write context that will be handed to the write function
+ *
+ * Returns:	False if the operation was aborted (takeover or handover).
+ *		True otherwise
+ *
+ * When false is returned, the caller is not allowed to touch console state.
+ * The console is owned by someone else. If the caller wants to print more
+ * it has to reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer.
+ */
+static bool cons_emit_record(struct cons_write_context *wctxt)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	bool done = false;
+
+	/*
+	 * @con->dropped is not protected in case of hostile takeovers so
+	 * the update below is racy. Annotate it accordingly.
+	 */
+	ctxt->dropped = data_race(READ_ONCE(con->dropped));
+
+	/* Fill the output buffer with the next record */
+	ctxt->backlog = cons_get_record(wctxt);
+	if (!ctxt->backlog)
+		return true;
+
+	/* Safety point. Don't touch state in case of takeover */
+	if (!console_can_proceed(wctxt))
+		return false;
+
+	/* Counterpart to the read above */
+	WRITE_ONCE(con->dropped, ctxt->dropped);
+
+	/*
+	 * In case of skipped records, update the sequence state in @con.
+	 */
+	if (!wctxt->outbuf)
+		goto update;
+
+	/* Tell the driver about potential unsafe state */
+	wctxt->unsafe = ctxt->state.unsafe;
+
+	if (!ctxt->thread && con->write_atomic) {
+		done = con->write_atomic(con, wctxt);
+	} else if (ctxt->thread && con->write_thread) {
+		done = con->write_thread(con, wctxt);
+	} else {
+		cons_release(ctxt);
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* If not done, the write was aborted due to takeover */
+	if (!done)
+		return false;
+
+	/* If there was a dropped message, it has now been output. */
+	if (ctxt->dropped) {
+		ctxt->dropped = 0;
+		/* Counterpart to the read above */
+		WRITE_ONCE(con->dropped, ctxt->dropped);
+	}
+update:
+	ctxt->newseq++;
+	/*
+	 * The sequence update attempt is not part of console_release()
+	 * because in panic situations the console is not released by
+	 * the panic CPU until all records are written. On 32bit the
+	 * sequence is separate from state anyway.
+	 */
+	return cons_seq_try_update(ctxt);
+}
+
+/**
+ * cons_kthread_should_wakeup - Check whether the printk thread should wakeup
+ * @con:	Console to operate on
+ * @ctxt:	The acquire context that contains the state
+ *		at cons_try_acquire()
+ *
+ * Returns: True if the thread should shut down or if the console is allowed
+ * to print and a record is available. False otherwise.
+ *
+ * After the thread wakes up, it must first check whether it should shut down
+ * before attempting any printing.
+ */
+static bool cons_kthread_should_wakeup(struct console *con, struct cons_context *ctxt)
+{
+	bool is_usable;
+	short flags;
+	int cookie;
+
+	if (kthread_should_stop())
+		return true;
+
+	cookie = console_srcu_read_lock();
+	flags = console_srcu_read_flags(con);
+	is_usable = console_is_usable(con, flags);
+	console_srcu_read_unlock(cookie);
+
+	if (!is_usable)
+		return false;
+
+	/* This reads state and sequence on 64bit. On 32bit only state */
+	cons_state_read(con, CON_STATE_CUR, &ctxt->state);
+
+	/*
+	 * Atomic printing is running on some other CPU. The owner
+	 * will wake the console thread on unlock if necessary.
+	 */
+	if (ctxt->state.locked)
+		return false;
+
+	/* Bring the sequence in @ctxt up to date */
+	cons_context_set_seq(ctxt);
+
+	return prb_read_valid(prb, ctxt->oldseq, NULL);
+}
+
+/**
+ * cons_kthread_func - The printk thread function
+ * @__console:	Console to operate on
+ */
+static int cons_kthread_func(void *__console)
+{
+	struct console *con = __console;
+	struct cons_write_context wctxt = {
+		.ctxt.console	= con,
+		.ctxt.prio	= CONS_PRIO_NORMAL,
+		.ctxt.thread	= 1,
+	};
+	struct cons_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+	unsigned long flags;
+	short con_flags;
+	bool backlog;
+	int cookie;
+	int ret;
+
+	for (;;) {
+		atomic_inc(&con->kthread_waiting);
+
+		/*
+		 * Provides a full memory barrier vs. cons_kthread_wake().
+		 */
+		ret = rcuwait_wait_event(&con->rcuwait,
+					 cons_kthread_should_wakeup(con, ctxt),
+					 TASK_INTERRUPTIBLE);
+
+		atomic_dec(&con->kthread_waiting);
+
+		if (kthread_should_stop())
+			break;
+
+		/* Wait was interrupted by a spurious signal, go back to sleep */
+		if (ret)
+			continue;
+
+		for (;;) {
+			cookie = console_srcu_read_lock();
+
+			/*
+			 * Ensure this stays on the CPU to make handover and
+			 * takeover possible.
+			 */
+			if (con->port_lock)
+				con->port_lock(con, true, &flags);
+			else
+				migrate_disable();
+
+			/*
+			 * Try to acquire the console without attempting to
+			 * take over. If an atomic printer wants to hand
+			 * back to the thread it simply wakes it up.
+			 */
+			if (!cons_try_acquire(ctxt))
+				break;
+
+			con_flags = console_srcu_read_flags(con);
+
+			if (console_is_usable(con, con_flags)) {
+				/*
+				 * If the emit fails, this context is no
+				 * longer the owner. Abort the processing and
+				 * wait for new records to print.
+				 */
+				if (!cons_emit_record(&wctxt))
+					break;
+				backlog = ctxt->backlog;
+			} else {
+				backlog = false;
+			}
+
+			/*
+			 * If the release fails, this context was not the
+			 * owner. Abort the processing and wait for new
+			 * records to print.
+			 */
+			if (!cons_release(ctxt))
+				break;
+
+			/* Backlog done? */
+			if (!backlog)
+				break;
+
+			if (con->port_lock)
+				con->port_lock(con, false, &flags);
+			else
+				migrate_enable();
+
+			console_srcu_read_unlock(cookie);
+
+			cond_resched();
+		}
+		if (con->port_lock)
+			con->port_lock(con, false, &flags);
+		else
+			migrate_enable();
+
+		console_srcu_read_unlock(cookie);
+	}
+	return 0;
+}
+
+/**
+ * cons_irq_work - irq work to wake printk thread
+ * @irq_work:	The irq work to operate on
+ */
+static void cons_irq_work(struct irq_work *irq_work)
+{
+	struct console *con = container_of(irq_work, struct console, irq_work);
+
+	cons_kthread_wake(con);
+}
+
+/**
+ * cons_wake_threads - Wake up printing threads
+ *
+ * A printing thread is only woken if it is within the @kthread_waiting
+ * block. If it is not within the block (or enters the block later), it
+ * will see any new records and continue printing on its own.
+ */
+void cons_wake_threads(void)
+{
+	struct console *con;
+	int cookie;
+
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(con) {
+		if (con->kthread && atomic_read(&con->kthread_waiting))
+			irq_work_queue(&con->irq_work);
+	}
+	console_srcu_read_unlock(cookie);
+}
+
+/**
+ * struct cons_cpu_state - Per CPU printk context state
+ * @prio:	The current context priority level
+ * @nesting:	Per priority nest counter
+ */
+struct cons_cpu_state {
+	enum cons_prio	prio;
+	int		nesting[CONS_PRIO_MAX];
+};
+
+static DEFINE_PER_CPU(struct cons_cpu_state, cons_pcpu_state);
+static struct cons_cpu_state early_cons_pcpu_state __initdata;
+
+/**
+ * cons_get_cpu_state - Get the per CPU console state pointer
+ *
+ * Returns either a pointer to the per CPU state of the current CPU or to
+ * the init data state during early boot.
+ */
+static __ref struct cons_cpu_state *cons_get_cpu_state(void)
+{
+	if (!printk_percpu_data_ready())
+		return &early_cons_pcpu_state;
+
+	return this_cpu_ptr(&cons_pcpu_state);
+}
+
+/**
+ * cons_get_wctxt - Get the write context for atomic printing
+ * @con:	Console to operate on
+ * @prio:	Priority of the context
+ *
+ * Returns either the per CPU context or the builtin context for
+ * early boot.
+ */
+static __ref struct cons_write_context *cons_get_wctxt(struct console *con,
+						       enum cons_prio prio)
+{
+	if (!con->pcpu_data)
+		return &early_cons_ctxt_data.wctxt[prio];
+
+	return &this_cpu_ptr(con->pcpu_data)->wctxt[prio];
+}
+
+/**
+ * cons_atomic_try_acquire - Try to acquire the console for atomic printing
+ * @con:	The console to acquire
+ * @ctxt:	The console context instance to work on
+ * @prio:	The priority of the current context
+ * @skip_unsafe:	True, to avoid unsafe hostile takeovers
+ */
+static bool cons_atomic_try_acquire(struct console *con, struct cons_context *ctxt,
+				    enum cons_prio prio, bool skip_unsafe)
+{
+	memset(ctxt, 0, sizeof(*ctxt));
+	ctxt->console		= con;
+	ctxt->spinwait_max_us	= 2000;
+	ctxt->prio		= prio;
+	ctxt->spinwait		= 1;
+
+	/* Try to acquire it directly or via a friendly handover */
+	if (cons_try_acquire(ctxt))
+		return true;
+
+	/* Investigate whether a hostile takeover is due */
+	if (ctxt->old_state.cur_prio >= prio)
+		return false;
+
+	if (!ctxt->old_state.unsafe || !skip_unsafe)
+		ctxt->hostile = 1;
+	return cons_try_acquire(ctxt);
+}
+
+/**
+ * cons_atomic_flush_con - Flush one console in atomic mode
+ * @wctxt:		The write context struct to use for this context
+ * @con:		The console to flush
+ * @prio:		The priority of the current context
+ * @skip_unsafe:	True, to avoid unsafe hostile takeovers
+ */
+static void cons_atomic_flush_con(struct cons_write_context *wctxt, struct console *con,
+				  enum cons_prio prio, bool skip_unsafe)
+{
+	struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	bool wake_thread = false;
+	short flags;
+
+	if (!cons_atomic_try_acquire(con, ctxt, prio, skip_unsafe))
+		return;
+
+	do {
+		flags = console_srcu_read_flags(con);
+
+		if (!console_is_usable(con, flags))
+			break;
+
+		/*
+		 * For normal prio messages let the printer thread handle
+		 * the printing if it is available.
+		 */
+		if (prio <= CONS_PRIO_NORMAL && con->kthread) {
+			wake_thread = true;
+			break;
+		}
+
+		/*
+		 * cons_emit_record() returns false when the console was
+		 * handed over or taken over. In both cases the context is
+		 * no longer valid.
+		 */
+		if (!cons_emit_record(wctxt))
+			return;
+	} while (ctxt->backlog);
+
+	cons_release(ctxt);
+
+	if (wake_thread && atomic_read(&con->kthread_waiting))
+		irq_work_queue(&con->irq_work);
+}
+
+/**
+ * cons_atomic_flush - Flush consoles in atomic mode if required
+ * @printk_caller_wctxt:	The write context struct to use for this
+ *				context (for printk() context only)
+ * @skip_unsafe:		True, to avoid unsafe hostile takeovers
+ */
+void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt, bool skip_unsafe)
+{
+	struct cons_write_context *wctxt;
+	struct cons_cpu_state *cpu_state;
+	struct console *con;
+	short flags;
+	int cookie;
+
+	cpu_state = cons_get_cpu_state();
+
+	/*
+	 * When in an elevated priority, the printk() calls are not
+	 * individually flushed. This allows the full output to be
+	 * dumped to the ringbuffer before printing of the backlog
+	 * starts.
+	 */
+	if (cpu_state->prio > CONS_PRIO_NORMAL && printk_caller_wctxt)
+		return;
+
+	/*
+	 * Let the outermost write of this priority print. This avoids
+	 * nasty hackery for nested WARN() where the printing itself
+	 * generates one.
+	 *
+	 * cpu_state->prio <= CONS_PRIO_NORMAL is not subject to nesting
+	 * and can proceed in order to allow atomic printing when consoles
+	 * do not have a printer thread.
+	 */
+	if (cpu_state->prio > CONS_PRIO_NORMAL &&
+	    cpu_state->nesting[cpu_state->prio] != 1)
+		return;
+
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(con) {
+		if (!con->write_atomic)
+			continue;
+
+		flags = console_srcu_read_flags(con);
+
+		if (!console_is_usable(con, flags))
+			continue;
+
+		if (cpu_state->prio > CONS_PRIO_NORMAL || !con->kthread) {
+			if (printk_caller_wctxt)
+				wctxt = printk_caller_wctxt;
+			else
+				wctxt = cons_get_wctxt(con, cpu_state->prio);
+			cons_atomic_flush_con(wctxt, con, cpu_state->prio, skip_unsafe);
+		}
+	}
+	console_srcu_read_unlock(cookie);
+}
+
+/**
+ * cons_atomic_enter - Enter a context that enforces atomic printing
+ * @prio:	Priority of the context
+ *
+ * Returns:	The previous priority that needs to be fed into
+ *		the corresponding cons_atomic_exit()
+ */
+enum cons_prio cons_atomic_enter(enum cons_prio prio)
+{
+	struct cons_cpu_state *cpu_state;
+	enum cons_prio prev_prio;
+
+	migrate_disable();
+	cpu_state = cons_get_cpu_state();
+
+	prev_prio = cpu_state->prio;
+	if (prev_prio < prio)
+		cpu_state->prio = prio;
+
+	/*
+	 * Increment the nesting on @cpu_state->prio so a WARN()
+	 * nested into a panic printout does not attempt to
+	 * scribble state.
+	 */
+	cpu_state->nesting[cpu_state->prio]++;
+
+	return prev_prio;
+}
+
+/**
+ * cons_atomic_exit - Exit a context that enforces atomic printing
+ * @prio:	Priority of the context to leave
+ * @prev_prio:	Priority of the previous context for restore
+ *
+ * @prev_prio is the priority returned by the corresponding cons_atomic_enter().
+ */
+void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio)
+{
+	struct cons_cpu_state *cpu_state;
+
+	cons_atomic_flush(NULL, true);
+
+	cpu_state = cons_get_cpu_state();
+
+	if (cpu_state->prio == CONS_PRIO_PANIC)
+		cons_atomic_flush(NULL, false);
+
+	/*
+	 * Undo the nesting of cons_atomic_enter() at the CPU state
+	 * priority.
+	 */
+	cpu_state->nesting[cpu_state->prio]--;
+
+	/*
+	 * Restore the previous priority, which was returned by
+	 * cons_atomic_enter().
+	 */
+	cpu_state->prio = prev_prio;
+
+	migrate_enable();
+}
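+
+/*
+ * Usage sketch: emergency printouts bracket their output with the
+ * enter/exit pair, as the RCU stall warning code later in this series
+ * does:
+ *
+ *	prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+ *	... dump the emergency information via printk() ...
+ *	cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
+ */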
+
+/**
+ * cons_kthread_stop - Stop a printk thread
+ * @con:	Console to operate on
+ */
+static void cons_kthread_stop(struct console *con)
+{
+	lockdep_assert_console_list_lock_held();
+
+	if (!con->kthread)
+		return;
+
+	kthread_stop(con->kthread);
+	con->kthread = NULL;
+
+	kfree(con->thread_pbufs);
+	con->thread_pbufs = NULL;
+}
+
+/**
+ * cons_kthread_create - Create a printk thread
+ * @con:	Console to operate on
+ *
+ * If it fails, let the console proceed. The atomic part might
+ * be usable and useful.
+ */
+void cons_kthread_create(struct console *con)
+{
+	struct task_struct *kt;
+	struct console *c;
+
+	lockdep_assert_console_list_lock_held();
+
+	if (!(con->flags & CON_NO_BKL) || !con->write_thread)
+		return;
+
+	if (!printk_threads_enabled || con->kthread)
+		return;
+
+	/*
+	 * Printer threads cannot be started as long as any boot console is
+	 * registered because there is no way to synchronize the hardware
+	 * registers between boot console code and regular console code.
+	 */
+	for_each_console(c) {
+		if (c->flags & CON_BOOT)
+			return;
+	}
+	have_boot_console = false;
+
+	con->thread_pbufs = kmalloc(sizeof(*con->thread_pbufs), GFP_KERNEL);
+	if (!con->thread_pbufs) {
+		con_printk(KERN_ERR, con, "failed to allocate printing thread buffers\n");
+		return;
+	}
+
+	kt = kthread_run(cons_kthread_func, con, "pr/%s%d", con->name, con->index);
+	if (IS_ERR(kt)) {
+		con_printk(KERN_ERR, con, "failed to start printing thread\n");
+		kfree(con->thread_pbufs);
+		con->thread_pbufs = NULL;
+		return;
+	}
+
+	con->kthread = kt;
+
+	/*
+	 * It is important that console printing threads are scheduled
+	 * shortly after a printk call and with generous runtime budgets.
+	 */
+	sched_set_normal(con->kthread, -20);
+}
+
+static int __init printk_setup_threads(void)
+{
+	struct console *con;
+
+	if (printk_force_atomic)
+		return 0;
+
+	console_list_lock();
+	printk_threads_enabled = true;
+	for_each_console(con)
+		cons_kthread_create(con);
+	if (have_bkl_console)
+		console_bkl_kthread_create();
+	console_list_unlock();
+	return 0;
+}
+early_initcall(printk_setup_threads);
+
+/**
+ * cons_nobkl_init - Initialize the NOBKL console specific data
+ * @con:	Console to initialize
+ *
+ * Returns: True on success. False otherwise and the console cannot be used.
+ */
+bool cons_nobkl_init(struct console *con)
+{
+	struct cons_state state = { };
+
+	if (!cons_alloc_percpu_data(con))
+		return false;
+
+	rcuwait_init(&con->rcuwait);
+	atomic_set(&con->kthread_waiting, 0);
+	init_irq_work(&con->irq_work, cons_irq_work);
+	cons_state_set(con, CON_STATE_CUR, &state);
+	cons_state_set(con, CON_STATE_REQ, &state);
+	cons_seq_init(con);
+	cons_kthread_create(con);
+	return true;
+}
+
+/**
+ * cons_nobkl_cleanup - Cleanup the NOBKL console specific data
+ * @con:	Console to cleanup
+ */
+void cons_nobkl_cleanup(struct console *con)
+{
+	struct cons_state state = { };
+
+	cons_kthread_stop(con);
+	cons_state_set(con, CON_STATE_CUR, &state);
+	cons_state_set(con, CON_STATE_REQ, &state);
+	cons_free_percpu_data(con);
+}
+
+/**
+ * printk_kthread_shutdown - shutdown all threaded printers
+ *
+ * On system shutdown all threaded printers are stopped. This allows printk
+ * to transition back to atomic printing, thus providing a robust mechanism
+ * for the final shutdown/reboot messages to be output.
+ */
+static void printk_kthread_shutdown(void)
+{
+	struct console *con;
+
+	console_list_lock();
+	for_each_console(con) {
+		if (con->flags & CON_NO_BKL)
+			cons_kthread_stop(con);
+	}
+	console_list_unlock();
+}
+
+static struct syscore_ops printk_syscore_ops = {
+	.shutdown = printk_kthread_shutdown,
+};
+
+static int __init printk_init_ops(void)
+{
+	register_syscore_ops(&printk_syscore_ops);
+	return 0;
+}
+device_initcall(printk_init_ops);
@ kernel/printk/printk_safe.c:15 @
 
 #include "internal.h"
 
-static DEFINE_PER_CPU(int, printk_context);
+struct printk_context {
+	local_lock_t cpu;
+	int recursion;
+};
+
+static DEFINE_PER_CPU(struct printk_context, printk_context) = {
+	.cpu = INIT_LOCAL_LOCK(cpu),
+};
 
 /* Can be preempted by NMI. */
-void __printk_safe_enter(void)
+void __printk_safe_enter(unsigned long *flags)
 {
-	this_cpu_inc(printk_context);
+	WARN_ON_ONCE(in_nmi());
+	local_lock_irqsave(&printk_context.cpu, *flags);
+	this_cpu_inc(printk_context.recursion);
 }
 
 /* Can be preempted by NMI. */
-void __printk_safe_exit(void)
+void __printk_safe_exit(unsigned long *flags)
 {
-	this_cpu_dec(printk_context);
+	WARN_ON_ONCE(in_nmi());
+	this_cpu_dec(printk_context.recursion);
+	local_unlock_irqrestore(&printk_context.cpu, *flags);
+}
+
+void __printk_deferred_enter(void)
+{
+	WARN_ON_ONCE(!in_atomic());
+	this_cpu_inc(printk_context.recursion);
+}
+
+void __printk_deferred_exit(void)
+{
+	WARN_ON_ONCE(!in_atomic());
+	this_cpu_dec(printk_context.recursion);
 }
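+
+/*
+ * Illustrative usage sketch: callers hand a flags word through the
+ * enter/exit pair so that the interrupt state is restored on exit:
+ *
+ *	unsigned long flags;
+ *
+ *	__printk_safe_enter(&flags);
+ *	... code that must not recurse into console drivers ...
+ *	__printk_safe_exit(&flags);
+ */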
 
 asmlinkage int vprintk(const char *fmt, va_list args)
@ kernel/printk/printk_safe.c:64 @ asmlinkage int vprintk(const char *fmt, va_list args)
 	 * Use the main logbuf even in NMI. But avoid calling console
 	 * drivers that might have their own locks.
 	 */
-	if (this_cpu_read(printk_context) || in_nmi()) {
-		int len;
-
-		len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
-		defer_console_output();
-		return len;
-	}
+	if (this_cpu_read(printk_context.recursion) || in_nmi())
+		return vprintk_deferred(fmt, args);
 
 	/* No obstacles. */
 	return vprintk_default(fmt, args);
@ kernel/rcu/rcutorture.c:2410 @ static int rcutorture_booster_init(unsigned int cpu)
 		WARN_ON_ONCE(!t);
 		sp.sched_priority = 2;
 		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_PREEMPT_RT
+		t = per_cpu(timersd, cpu);
+		WARN_ON_ONCE(!t);
+		sp.sched_priority = 2;
+		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#endif
 	}
 
 	/* Don't allow time recalculation while creating a new task. */
@ kernel/rcu/tree_stall.h:11 @
  */
 
 #include <linux/kvm_para.h>
+#include <linux/console.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@ kernel/rcu/tree_stall.h:586 @ static void rcu_check_gp_kthread_expired_fqs_timer(void)
 
 static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 {
+	enum cons_prio prev_prio;
 	int cpu;
 	unsigned long flags;
 	unsigned long gpa;
@ kernel/rcu/tree_stall.h:602 @ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 	if (rcu_stall_is_suppressed())
 		return;
 
+	prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+
 	/*
 	 * OK, time to rat on our buddy...
 	 * See Documentation/RCU/stallwarn.rst for info on how to debug
@ kernel/rcu/tree_stall.h:658 @ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 	panic_on_rcu_stall();
 
 	rcu_force_quiescent_state();  /* Kick them all. */
+
+	cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
 }
 
 static void print_cpu_stall(unsigned long gps)
@ kernel/sched/core.c:1045 @ void resched_curr(struct rq *rq)
 		trace_sched_wake_idle_without_ipi(cpu);
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+	return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	int cpu;
+
+	if (!sched_feat(PREEMPT_LAZY)) {
+		resched_curr(rq);
+		return;
+	}
+
+	if (test_tsk_need_resched(curr))
+		return;
+
+	if (test_tsk_need_resched_lazy(curr))
+		return;
+
+	set_tsk_need_resched_lazy(curr);
+
+	cpu = cpu_of(rq);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED_LAZY must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(curr))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@ kernel/sched/core.c:2273 @ void migrate_disable(void)
 	preempt_disable();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
+	preempt_lazy_disable();
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
@ kernel/sched/core.c:2309 @ void migrate_enable(void)
 	barrier();
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
+	preempt_lazy_enable();
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@ kernel/sched/core.c:3363 @ int migrate_swap(struct task_struct *cur, struct task_struct *p,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Consider:
+ *
+ *  set_special_state(X);
+ *
+ *  do_things()
+ *    // Somewhere in there is an rtlock that can be contended:
+ *    current_save_and_set_rtlock_wait_state();
+ *    [...]
+ *    schedule_rtlock(); (A)
+ *    [...]
+ *    current_restore_rtlock_saved_state();
+ *
+ *  schedule(); (B)
+ *
+ * If p->saved_state is anything else than TASK_RUNNING, then p blocked on an
+ * rtlock (A) *before* voluntarily calling into schedule() (B) after setting its
+ * state to X. For things like ptrace (X=TASK_TRACED), the task could have more
+ * work to do upon acquiring the lock in do_things() before whoever called
+ * wait_task_inactive() should return. IOW, we have to wait for:
+ *
+ *   p.saved_state = TASK_RUNNING
+ *   p.__state     = X
+ *
+ * which implies the task isn't blocked on an RT lock and got to schedule() (B).
+ *
+ * Also see comments in ttwu_state_match().
+ */
+
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+	unsigned long flags;
+	bool mismatch;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	if (READ_ONCE(p->__state) & match_state)
+		mismatch = false;
+	else if (READ_ONCE(p->saved_state) & match_state)
+		mismatch = false;
+	else
+		mismatch = true;
+
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	return mismatch;
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+					bool *wait)
+{
+	if (READ_ONCE(p->__state) & match_state)
+		return true;
+	if (READ_ONCE(p->saved_state) & match_state) {
+		*wait = true;
+		return true;
+	}
+	return false;
+}
+#else
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+	return !(READ_ONCE(p->__state) & match_state);
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+					bool *wait)
+{
+	return (READ_ONCE(p->__state) & match_state);
+}
+#endif
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@ kernel/sched/core.c:3451 @ int migrate_swap(struct task_struct *cur, struct task_struct *p,
  */
 unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
-	int running, queued;
+	bool running, wait;
 	struct rq_flags rf;
 	unsigned long ncsw;
 	struct rq *rq;
@ kernel/sched/core.c:3477 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		 * is actually now running somewhere else!
 		 */
 		while (task_on_cpu(rq, p)) {
-			if (!(READ_ONCE(p->__state) & match_state))
+			if (state_mismatch(p, match_state))
 				return 0;
 			cpu_relax();
 		}
@ kernel/sched/core.c:3490 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		rq = task_rq_lock(p, &rf);
 		trace_sched_wait_task(p);
 		running = task_on_cpu(rq, p);
-		queued = task_on_rq_queued(p);
+		wait = task_on_rq_queued(p);
 		ncsw = 0;
-		if (READ_ONCE(p->__state) & match_state)
+
+		if (state_match(p, match_state, &wait))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@ kernel/sched/core.c:3523 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(queued)) {
+		if (unlikely(wait)) {
 			ktime_t to = NSEC_PER_SEC / HZ;
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
@ kernel/sched/core.c:4828 @ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(p)->preempt_lazy_count = 0;
+#endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
@ kernel/sched/core.c:6707 @ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_lazy(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
 	rq->last_seen_need_resched_ns = 0;
@ kernel/sched/core.c:6768 @ void __noreturn do_task_dead(void)
 		cpu_relax();
 }
 
-static inline void sched_submit_work(struct task_struct *tsk)
+void sched_submit_work(void)
 {
-	unsigned int task_flags;
+	struct task_struct *tsk = current;
+	unsigned int task_flags = tsk->flags;
 
-	if (task_is_running(tsk))
-		return;
-
-	task_flags = tsk->flags;
 	/*
 	 * If a worker goes to sleep, notify and ask workqueue whether it
 	 * wants to wake up a task to maintain concurrency.
@ kernel/sched/core.c:6798 @ static inline void sched_submit_work(struct task_struct *tsk)
 	blk_flush_plug(tsk->plug, true);
 }
 
-static void sched_update_worker(struct task_struct *tsk)
+void sched_resume_work(void)
 {
+	struct task_struct *tsk = current;
+
 	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
@ kernel/sched/core.c:6810 @ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-asmlinkage __visible void __sched schedule(void)
+static void schedule_loop(unsigned int sched_mode)
 {
-	struct task_struct *tsk = current;
-
-	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(SM_NONE);
+		__schedule(sched_mode);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
-	sched_update_worker(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+	if (!task_is_running(current))
+		sched_submit_work();
+	schedule_loop(SM_NONE);
+	sched_resume_work();
 }
 EXPORT_SYMBOL(schedule);
 
+void schedule_rtmutex(void)
+{
+	schedule_loop(SM_NONE);
+}
+
 /*
  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
  * state (have scheduled out non-voluntarily) by making sure that all
@ kernel/sched/core.c:6892 @ void __sched schedule_preempt_disabled(void)
 #ifdef CONFIG_PREEMPT_RT
 void __sched notrace schedule_rtlock(void)
 {
-	do {
-		preempt_disable();
-		__schedule(SM_RTLOCK_WAIT);
-		sched_preempt_enable_no_resched();
-	} while (need_resched());
+	schedule_loop(SM_RTLOCK_WAIT);
 }
 NOKPROBE_SYMBOL(schedule_rtlock);
 #endif
@ kernel/sched/core.c:6926 @ static void __sched notrace preempt_schedule_common(void)
 	} while (need_resched());
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away since it is
+ * set by an RT task. Otherwise we try to avoid being scheduled out as long
+ * as the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+	if (test_thread_flag(TIF_NEED_RESCHED))
+		return 1;
+	if (current_thread_info()->preempt_lazy_count)
+		return 0;
+	return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+	return 1;
+}
+
+#endif
+
 #ifdef CONFIG_PREEMPTION
 /*
  * This is the entry point to schedule() from in-kernel preemption
@ kernel/sched/core.c:6963 @ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	 */
 	if (likely(!preemptible()))
 		return;
+	if (!preemptible_lazy())
+		return;
 	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
@ kernel/sched/core.c:7012 @ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 	if (likely(!preemptible()))
 		return;
 
+	if (!preemptible_lazy())
+		return;
+
 	do {
 		/*
 		 * Because the function tracer can trace preempt_count_sub()
@ kernel/sched/core.c:9320 @ void __init init_idle(struct task_struct *idle, int cpu)
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
@ kernel/sched/fair.c:4951 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 	if (delta_exec > ideal_runtime) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		/*
 		 * The current task ran long enough, ensure it doesn't get
 		 * re-elected due to buddy favours.
@ kernel/sched/fair.c:4975 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (delta > ideal_runtime)
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 }
 
 static void
@ kernel/sched/fair.c:5121 @ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 	/*
@ kernel/sched/fair.c:5270 @ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * hierarchy can be throttled
 	 */
 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 }
 
 static __always_inline
@ kernel/sched/fair.c:6145 @ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 
 		if (delta < 0) {
 			if (task_current(rq, p))
-				resched_curr(rq);
+				resched_curr_lazy(rq);
 			return;
 		}
 		hrtick_start(rq, delta);
@ kernel/sched/fair.c:7874 @ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 	/*
 	 * Only set the backward buddy when the current task is still
 	 * on the rq. This can happen when a wakeup gets interleaved
@ kernel/sched/fair.c:12039 @ static void task_fork_fair(struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
@ kernel/sched/fair.c:12066 @ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	 */
 	if (task_current(rq, p)) {
 		if (p->prio > oldprio)
-			resched_curr(rq);
+			resched_curr_lazy(rq);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
@ kernel/sched/features.h:51 @ SCHED_FEAT(NONTASK_CAPACITY, true)
 
 #ifdef CONFIG_PREEMPT_RT
 SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
 #else
 
 /*
@ kernel/sched/sched.h:2373 @ extern void reweight_task(struct task_struct *p, int prio);
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
 
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+	resched_curr(rq);
+}
+#endif
+
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@ kernel/signal.c:435 @ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 		return NULL;
 
 	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+
+		if (!sigqueue_flags) {
+			struct sighand_struct *sighand = t->sighand;
+
+			lockdep_assert_held(&sighand->siglock);
+			if (sighand->sigqueue_cache) {
+				q = sighand->sigqueue_cache;
+				sighand->sigqueue_cache = NULL;
+			}
+		}
+		if (!q)
+			q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
 	} else {
 		print_dropped_signal(sig);
 	}
@ kernel/signal.c:461 @ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	return q;
 }
 
-static void __sigqueue_free(struct sigqueue *q)
+static bool sigqueue_cleanup_accounting(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
-		return;
+		return false;
 	if (q->ucounts) {
 		dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
 		q->ucounts = NULL;
 	}
+	return true;
+}
+
+static void __sigqueue_free(struct sigqueue *q)
+{
+	if (!sigqueue_cleanup_accounting(q))
+		return;
+	kmem_cache_free(sigqueue_cachep, q);
+}
+
+void sigqueue_free_cached_entry(struct sigqueue *q)
+{
+	if (!q)
+		return;
+	kmem_cache_free(sigqueue_cachep, q);
+}
+
+static void sigqueue_cache_or_free(struct sigqueue *q)
+{
+	struct sighand_struct *sighand = current->sighand;
+
+	if (!sigqueue_cleanup_accounting(q))
+		return;
+
+	lockdep_assert_held(&sighand->siglock);
+	if (!sighand->sigqueue_cache) {
+		sighand->sigqueue_cache = q;
+		return;
+	}
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
@ kernel/signal.c:637 @ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
 			(info->si_code == SI_TIMER) &&
 			(info->si_sys_private);
 
-		__sigqueue_free(first);
+		sigqueue_cache_or_free(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.  This must be
@ kernel/signal.c:2339 @ static int ptrace_stop(int exit_code, int why, unsigned long message,
 		do_notify_parent_cldstop(current, false, why);
 
 	/*
-	 * Don't want to allow preemption here, because
-	 * sys_ptrace() needs this task to be inactive.
+	 * The previous do_notify_parent_cldstop() invocation woke the ptracer.
+	 * On a PREEMPTION kernel this can result in a preemption requirement
+	 * which will be fulfilled after read_unlock() and the ptracer will be
+	 * put on the CPU.
+	 * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+	 * this task to wait in schedule(). If this task gets preempted then it
+	 * remains enqueued on the runqueue. The ptracer will observe this and
+	 * then sleep for a delay of one HZ tick. In the meantime this task
+	 * gets scheduled, enters schedule() and will wait for the ptracer.
 	 *
-	 * XXX: implement read_unlock_no_resched().
+	 * This preemption point is not bad from a correctness point of view but
+	 * it extends the runtime by one HZ tick due to the ptracer's sleep.
+	 * The preempt-disable section ensures that there will be no preemption
+	 * between unlock and schedule(), which improves performance since
+	 * the ptracer has no reason to sleep.
+	 *
+	 * This optimisation is not doable on PREEMPT_RT due to the spinlock_t
+	 * within the preempt-disable section.
 	 */
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	read_unlock(&tasklist_lock);
 	cgroup_enter_frozen();
-	preempt_enable_no_resched();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable_no_resched();
 	schedule();
 	cgroup_leave_frozen(true);
 
@ kernel/softirq.c:83 @ static void wakeup_softirqd(void)
 		wake_up_process(tsk);
 }
 
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
- */
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
-{
-	struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
-	if (pending & SOFTIRQ_NOW_MASK)
-		return false;
-	return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
-}
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 DEFINE_PER_CPU(int, hardirqs_enabled);
 DEFINE_PER_CPU(int, hardirq_context);
@ kernel/softirq.c:224 @ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 		goto out;
 
 	pending = local_softirq_pending();
-	if (!pending || ksoftirqd_running(pending))
+	if (!pending)
 		goto out;
 
 	/*
@ kernel/softirq.c:420 @ static inline bool should_wake_ksoftirqd(void)
 
 static inline void invoke_softirq(void)
 {
-	if (ksoftirqd_running(local_softirq_pending()))
-		return;
-
 	if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
 #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
 		/*
@ kernel/softirq.c:453 @ asmlinkage __visible void do_softirq(void)
 
 	pending = local_softirq_pending();
 
-	if (pending && !ksoftirqd_running(pending))
+	if (pending)
 		do_softirq_own_stack();
 
 	local_irq_restore(flags);
@ kernel/softirq.c:622 @ static inline void tick_irq_exit(void)
 #endif
 }
 
+#ifdef CONFIG_PREEMPT_RT
+DEFINE_PER_CPU(struct task_struct *, timersd);
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static void wake_timersd(void)
+{
+	struct task_struct *tsk = __this_cpu_read(timersd);
+
+	if (tsk)
+		wake_up_process(tsk);
+}
+
+#else
+
+static inline void wake_timersd(void) { }
+
+#endif
+
 static inline void __irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@ kernel/softirq.c:652 @ static inline void __irq_exit_rcu(void)
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() &&
+	    !(in_nmi() | in_hardirq()))
+		wake_timersd();
+
 	tick_irq_exit();
 }
 
@ kernel/softirq.c:983 @ static struct smp_hotplug_thread softirq_threads = {
 	.thread_comm		= "ksoftirqd/%u",
 };
 
+#ifdef CONFIG_PREEMPT_RT
+static void timersd_setup(unsigned int cpu)
+{
+	sched_set_fifo_low(current);
+}
+
+static int timersd_should_run(unsigned int cpu)
+{
+	return local_pending_timers();
+}
+
+static void run_timersd(unsigned int cpu)
+{
+	unsigned int timer_si;
+
+	ksoftirqd_run_begin();
+
+	timer_si = local_pending_timers();
+	__this_cpu_write(pending_timer_softirq, 0);
+	or_softirq_pending(timer_si);
+
+	__do_softirq();
+
+	ksoftirqd_run_end();
+}
+
+static void raise_ktimers_thread(unsigned int nr)
+{
+	trace_softirq_raise(nr);
+	__this_cpu_or(pending_timer_softirq, 1 << nr);
+}
+
+void raise_hrtimer_softirq(void)
+{
+	raise_ktimers_thread(HRTIMER_SOFTIRQ);
+}
+
+void raise_timer_softirq(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	raise_ktimers_thread(TIMER_SOFTIRQ);
+	wake_timersd();
+	local_irq_restore(flags);
+}
+
+static struct smp_hotplug_thread timer_threads = {
+	.store			= &timersd,
+	.setup			= timersd_setup,
+	.thread_should_run	= timersd_should_run,
+	.thread_fn		= run_timersd,
+	.thread_comm		= "ktimers/%u",
+};
+#endif
+
 static __init int spawn_ksoftirqd(void)
 {
 	cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
 				  takeover_tasklets);
 	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
+#ifdef CONFIG_PREEMPT_RT
+	BUG_ON(smpboot_register_percpu_thread(&timer_threads));
+#endif
 	return 0;
 }
 early_initcall(spawn_ksoftirqd);
@ kernel/time/hrtimer.c:1808 @ void hrtimer_interrupt(struct clock_event_device *dev)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@ kernel/time/hrtimer.c:1921 @ void hrtimer_run_queues(void)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@ kernel/time/posix-timers.c:143 @ static struct k_itimer *posix_timer_by_id(timer_t id)
 static int posix_timer_add(struct k_itimer *timer)
 {
 	struct signal_struct *sig = current->signal;
-	int first_free_id = sig->posix_timer_id;
 	struct hlist_head *head;
-	int ret = -ENOENT;
+	unsigned int start, id;
 
-	do {
+	/* Can be written by a different task concurrently in the loop below */
+	start = READ_ONCE(sig->next_posix_timer_id);
+
+	for (id = ~start; start != id; id++) {
 		spin_lock(&hash_lock);
-		head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
-		if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+		id = sig->next_posix_timer_id;
+
+		/* Write the next ID back. Clamp it to the positive space */
+		WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX);
+
+		head = &posix_timers_hashtable[hash(sig, id)];
+		if (!__posix_timers_find(head, sig, id)) {
 			hlist_add_head_rcu(&timer->t_hash, head);
-			ret = sig->posix_timer_id;
+			spin_unlock(&hash_lock);
+			return id;
 		}
-		if (++sig->posix_timer_id < 0)
-			sig->posix_timer_id = 0;
-		if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
-			/* Loop over all possible ids completed */
-			ret = -EAGAIN;
 		spin_unlock(&hash_lock);
-	} while (ret == -ENOENT);
-	return ret;
+	}
+	/* POSIX return code when no timer ID could be allocated */
+	return -EAGAIN;
 }
 
 static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
@ kernel/time/posix-timers.c:1044 @ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
 }
 
 /*
- * return timer owned by the process, used by exit_itimers
+ * Delete a timer if it is armed, remove it from the hash and schedule it
+ * for RCU freeing.
  */
 static void itimer_delete(struct k_itimer *timer)
 {
-retry_delete:
-	spin_lock_irq(&timer->it_lock);
+	unsigned long flags;
 
+retry_delete:
+	/*
+	 * irqsave is required to make timer_wait_running() work.
+	 */
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+	/*
+	 * Even if the timer is no longer accessible from other tasks
+	 * it still might be armed and queued in the underlying timer
+	 * mechanism. Worse, that timer mechanism might run the expiry
+	 * function concurrently.
+	 */
 	if (timer_delete_hook(timer) == TIMER_RETRY) {
-		spin_unlock_irq(&timer->it_lock);
+		/*
+		 * The timer is expiring concurrently; prevent livelocks
+		 * and pointless spinning on RT.
+		 *
+		 * The CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y case is
+		 * irrelevant here because obviously the exiting task
+		 * cannot be expiring the timer via task work concurrently.
+		 * Ditto for CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n as the
+		 * tick interrupt cannot run on this CPU because the above
+		 * spin_lock disabled interrupts.
+		 *
+		 * timer_wait_running() drops timer::it_lock, which opens
+		 * the possibility for another task to delete the timer.
+		 *
+		 * That's not possible here because this is invoked from
+		 * do_exit() only for the last thread of the thread group.
+		 * So no other task can access that timer.
+		 */
+		if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
+			return;
+
 		goto retry_delete;
 	}
 	list_del(&timer->list);
 
-	spin_unlock_irq(&timer->it_lock);
+	spin_unlock_irqrestore(&timer->it_lock, flags);
 	release_posix_timer(timer, IT_ID_SET);
 }
 
 /*
- * This is called by do_exit or de_thread, only when nobody else can
- * modify the signal->posix_timers list. Yet we need sighand->siglock
- * to prevent the race with /proc/pid/timers.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
  */
 void exit_itimers(struct task_struct *tsk)
 {
@ kernel/time/posix-timers.c:1106 @ void exit_itimers(struct task_struct *tsk)
 	if (list_empty(&tsk->signal->posix_timers))
 		return;
 
+	/* Protect against concurrent read via /proc/$PID/timers */
 	spin_lock_irq(&tsk->sighand->siglock);
 	list_replace_init(&tsk->signal->posix_timers, &timers);
 	spin_unlock_irq(&tsk->sighand->siglock);
 
+	/* The timers are no longer accessible via tsk::signal */
 	while (!list_empty(&timers)) {
 		tmr = list_first_entry(&timers, struct k_itimer, list);
 		itimer_delete(tmr);
@ kernel/time/tick-sched.c:792 @ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 
 static inline bool local_timer_softirq_pending(void)
 {
-	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+	return local_pending_timers() & BIT(TIMER_SOFTIRQ);
 }
 
 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
@ kernel/time/timer.c:2057 @ static void run_local_timers(void)
 		if (time_before(jiffies, base->next_expiry))
 			return;
 	}
-	raise_softirq(TIMER_SOFTIRQ);
+	raise_timer_softirq();
 }
 
 /*
@ kernel/trace/trace.c:2697 @ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 	if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
 		trace_flags |= TRACE_FLAG_BH_OFF;
 
-	if (tif_need_resched())
+	if (tif_need_resched_now())
 		trace_flags |= TRACE_FLAG_NEED_RESCHED;
+#ifdef CONFIG_PREEMPT_LAZY
+	/* Out of flag bits: NEED_RESCHED_LAZY shares the PREEMPT_RESCHED bit */
+	if (need_resched_lazy())
+		trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
+#else
 	if (test_preempt_need_resched())
 		trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
-	return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+#endif
+
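+	/*
+	 * Pack the return value: trace flags in bits 24-31, the lazy
+	 * preempt count in bits 16-23, the migrate disable counter in
+	 * bits 4-7 and the clamped preempt count in bits 0-3.
+	 */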
+	return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+		(preempt_lazy_count() & 0xff) << 16 |
 		(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
 }
 
@ kernel/trace/trace.c:4298 @ unsigned long trace_total_entries(struct trace_array *tr)
 
 static void print_lat_help_header(struct seq_file *m)
 {
-	seq_puts(m, "#                    _------=> CPU#            \n"
-		    "#                   / _-----=> irqs-off/BH-disabled\n"
-		    "#                  | / _----=> need-resched    \n"
-		    "#                  || / _---=> hardirq/softirq \n"
-		    "#                  ||| / _--=> preempt-depth   \n"
-		    "#                  |||| / _-=> migrate-disable \n"
-		    "#                  ||||| /     delay           \n"
-		    "#  cmd     pid     |||||| time  |   caller     \n"
-		    "#     \\   /        ||||||  \\    |    /       \n");
+	seq_puts(m, "#                    _--------=> CPU#            \n"
+		    "#                   / _-------=> irqs-off/BH-disabled\n"
+		    "#                  | / _------=> need-resched    \n"
+		    "#                  || / _-----=> need-resched-lazy\n"
+		    "#                  ||| / _----=> hardirq/softirq \n"
+		    "#                  |||| / _---=> preempt-depth   \n"
+		    "#                  ||||| / _--=> preempt-lazy-depth\n"
+		    "#                  |||||| / _-=> migrate-disable \n"
+		    "#                  ||||||| /     delay           \n"
+		    "#  cmd     pid     |||||||| time  |   caller     \n"
+		    "#     \\   /        ||||||||  \\    |    /       \n");
 }
 
 static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@ kernel/trace/trace.c:4342 @ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
 
 	print_event_info(buf, m);
 
-	seq_printf(m, "#                            %.*s  _-----=> irqs-off/BH-disabled\n", prec, space);
-	seq_printf(m, "#                            %.*s / _----=> need-resched\n", prec, space);
-	seq_printf(m, "#                            %.*s| / _---=> hardirq/softirq\n", prec, space);
-	seq_printf(m, "#                            %.*s|| / _--=> preempt-depth\n", prec, space);
-	seq_printf(m, "#                            %.*s||| / _-=> migrate-disable\n", prec, space);
-	seq_printf(m, "#                            %.*s|||| /     delay\n", prec, space);
-	seq_printf(m, "#           TASK-PID  %.*s CPU#  |||||  TIMESTAMP  FUNCTION\n", prec, "     TGID   ");
-	seq_printf(m, "#              | |    %.*s   |   |||||     |         |\n", prec, "       |    ");
+	seq_printf(m, "#                            %.*s  _-------=> irqs-off/BH-disabled\n", prec, space);
+	seq_printf(m, "#                            %.*s / _------=> need-resched\n", prec, space);
+	seq_printf(m, "#                            %.*s| / _-----=> need-resched-lazy\n", prec, space);
+	seq_printf(m, "#                            %.*s|| / _----=> hardirq/softirq\n", prec, space);
+	seq_printf(m, "#                            %.*s||| / _---=> preempt-depth\n", prec, space);
+	seq_printf(m, "#                            %.*s|||| / _--=> preempt-lazy-depth\n", prec, space);
+	seq_printf(m, "#                            %.*s||||| / _-=> migrate-disable\n", prec, space);
+	seq_printf(m, "#                            %.*s|||||| /     delay\n", prec, space);
+	seq_printf(m, "#           TASK-PID  %.*s CPU#  |||||||  TIMESTAMP  FUNCTION\n", prec, "     TGID   ");
+	seq_printf(m, "#              | |    %.*s   |   |||||||      |         |\n", prec, "       |    ");
 }
 
 void
@ kernel/trace/trace_events.c:211 @ static int trace_define_common_fields(void)
 	/* Holds both preempt_count and migrate_disable */
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
+	__common_field(unsigned char, preempt_lazy_count);
 
 	return ret;
 }
@ kernel/trace/trace_output.c:445 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	char hardsoft_irq;
 	char need_resched;
+	char need_resched_lazy;
 	char irqs_off;
 	int hardirq;
 	int softirq;
@ kernel/trace/trace_output.c:466 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 
 	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
 				TRACE_FLAG_PREEMPT_RESCHED)) {
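+	/*
+	 * With PREEMPT_LAZY the PREEMPT_RESCHED flag bit is shared with
+	 * NEED_RESCHED_LAZY (see tracing_gen_ctx_irq_test()), so the 'N'
+	 * and 'p' states cannot be decoded here. Lazy rescheduling is
+	 * reported separately below.
+	 */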
+#ifndef CONFIG_PREEMPT_LAZY
 	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'N';
 		break;
+#endif
 	case TRACE_FLAG_NEED_RESCHED:
 		need_resched = 'n';
 		break;
+#ifndef CONFIG_PREEMPT_LAZY
 	case TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'p';
 		break;
+#endif
 	default:
 		need_resched = '.';
 		break;
 	}
 
+	need_resched_lazy =
+		(entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
 	hardsoft_irq =
 		(nmi && hardirq)     ? 'Z' :
 		nmi                  ? 'z' :
@ kernel/trace/trace_output.c:495 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 		softirq              ? 's' :
 		                       '.' ;
 
-	trace_seq_printf(s, "%c%c%c",
-			 irqs_off, need_resched, hardsoft_irq);
+	trace_seq_printf(s, "%c%c%c%c",
+			 irqs_off, need_resched, need_resched_lazy,
+			 hardsoft_irq);
 
 	if (entry->preempt_count & 0xf)
 		trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
 	else
 		trace_seq_putc(s, '.');
 
+	if (entry->preempt_lazy_count)
+		trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+	else
+		trace_seq_putc(s, '.');
+
 	if (entry->preempt_count & 0xf0)
 		trace_seq_printf(s, "%x", entry->preempt_count >> 4);
 	else
@ localversion-rt:1 @
+-rt15
@ net/core/dev.c:4576 @ static void rps_trigger_softirq(void *data)
 
 #endif /* CONFIG_RPS */
 
-/* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data)
-{
-	struct softnet_data *sd = data;
-
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-	smp_store_release(&sd->defer_ipi_scheduled, 0);
-}
-
 /*
  * Check if this softnet_data structure is another cpu one
  * If yes, queue it to our IPI list and return 1
@ net/core/dev.c:6626 @ static void skb_defer_free_flush(struct softnet_data *sd)
 	}
 }
 
+#ifndef CONFIG_PREEMPT_RT
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+	struct softnet_data *sd = data;
+
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
+#else
+
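+/*
+ * Called from the work queue on PREEMPT_RT: flush the deferred skbs
+ * directly with bottom halves disabled instead of raising
+ * NET_RX_SOFTIRQ from an IPI.
+ */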
+static void trigger_rx_softirq(struct work_struct *defer_work)
+{
+	struct softnet_data *sd;
+
+	sd = container_of(defer_work, struct softnet_data, defer_work);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
+	local_bh_disable();
+	skb_defer_free_flush(sd);
+	local_bh_enable();
+}
+#endif
+
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@ net/core/dev.c:11427 @ static int __init net_dev_init(void)
 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
+#ifndef CONFIG_PREEMPT_RT
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
+#else
+		INIT_WORK(&sd->defer_work, trigger_rx_softirq);
+#endif
 		spin_lock_init(&sd->defer_lock);
 
 		init_gro_hash(&sd->backlog);
@ net/core/skbuff.c:6859 @ nodefer:	__kfree_skb(skb);
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+#ifndef CONFIG_PREEMPT_RT
 		smp_call_function_single_async(cpu, &sd->defer_csd);
+#else
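+		/* On PREEMPT_RT, kick the remote CPU via its deferred-free work item */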
+		schedule_work_on(cpu, &sd->defer_work);
+#endif
+	}
 }