@ arch/arm/Kconfig:37 @ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @ arch/arm/Kconfig:76 @ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL @ arch/arm/Kconfig:121 @ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ @ arch/arm/include/asm/assembler.h:247 @ THUMB( fpreg .req r7 ) .endm #endif - .macro local_bh_disable, ti, tmp - ldr \tmp, [\ti, #TI_PREEMPT] - add \tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET - str \tmp, [\ti, #TI_PREEMPT] - .endm - - .macro local_bh_enable_ti, ti, tmp - get_thread_info \ti - ldr \tmp, [\ti, #TI_PREEMPT] - sub \tmp, \tmp, #SOFTIRQ_DISABLE_OFFSET - str \tmp, [\ti, #TI_PREEMPT] - .endm - #define USERL(l, x...) \ 9999: x; \ .pushsection __ex_table,"a"; \ @ arch/arm/include/asm/thread_info.h:65 @ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ struct cpu_context_save cpu_context; /* cpu context */ @ arch/arm/include/asm/thread_info.h:133 @ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ #define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 5 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @ arch/arm/include/asm/thread_info.h:153 @ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) /* Checks for any syscall work in entry-common.S */ @ arch/arm/include/asm/thread_info.h:163 @ extern int vfp_restore_user_hwstate(struct user_vfp *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NOTIFY_SIGNAL) @ arch/arm/kernel/asm-offsets.c:46 @ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); @ 
arch/arm/kernel/entry-armv.S:225 @ ENDPROC(__dabt_svc)
 #ifdef CONFIG_PREEMPTION
 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
-	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
 	teq	r8, #0				@ if preempt count != 0
+	bne	1f				@ return from exception
+	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
+	tst	r0, #_TIF_NEED_RESCHED		@ if NEED_RESCHED is set
+	blne	svc_preempt			@ preempt!
+
+	ldr	r8, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
+	teq	r8, #0				@ if preempt lazy count != 0
 	movne	r0, #0				@ force flags to 0
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED_LAZY
 	blne	svc_preempt
+1:
 #endif
 
 	svc_exit r5, irq = 1			@ return from exception
@ arch/arm/kernel/entry-armv.S:251 @ ENDPROC(__irq_svc)
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
 	tst	r0, #_TIF_NEED_RESCHED
+	bne	1b
+	tst	r0, #_TIF_NEED_RESCHED_LAZY
 	reteq	r8				@ go again
-	b	1b
+	ldr	r0, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
+	teq	r0, #0				@ if preempt lazy count != 0
+	beq	1b
+	ret	r8				@ go again
+
 #endif
 
 __und_fault:
@ arch/arm/kernel/signal.c:610 @ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	 */
 	trace_hardirqs_off();
 	do {
-		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+		if (likely(thread_flags & (_TIF_NEED_RESCHED |
+					   _TIF_NEED_RESCHED_LAZY))) {
 			schedule();
 		} else {
 			if (unlikely(!user_mode(regs)))
@ arch/arm/mm/fault.c:439 @ do_translation_fault(unsigned long addr, unsigned int fsr,
 	if (addr < TASK_SIZE)
 		return do_page_fault(addr, fsr, regs);
 
+	if (interrupts_enabled(regs))
+		local_irq_enable();
+
 	if (user_mode(regs))
 		goto bad_area;
@ arch/arm/mm/fault.c:512 @ do_translation_fault(unsigned long addr, unsigned int fsr,
 static int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
+	if (interrupts_enabled(regs))
+		local_irq_enable();
+
 	do_bad_area(addr, fsr, regs);
 	return 0;
 }
@ arch/arm/vfp/entry.S:26 @
 @
 ENTRY(do_vfp)
 	mov	r1, r10
-	mov	r3, r9
-	ldr	r4, .LCvfp
-	ldr	pc, [r4]		@ call VFP entry point
+	str	lr, [sp, #-8]!
+	add	r3, sp, #4
+	str	r9, [r3]
+	bl	vfp_entry
+	ldr	pc, [sp], #8
 ENDPROC(do_vfp)
-
-ENTRY(vfp_null_entry)
-	ret	lr
-ENDPROC(vfp_null_entry)
-
-	.align	2
-.LCvfp:
-	.word	vfp_vector
@ arch/arm/vfp/vfphw.S:78 @
 @  lr  = unrecognised instruction return address
 @  IRQs enabled.
 ENTRY(vfp_support_entry)
-	local_bh_disable r1, r4
-
 	ldr	r11, [r1, #TI_CPU]	@ CPU number
 	add	r10, r1, #TI_VFPSTATE	@ r10 = workspace
@ arch/arm/vfp/vfphw.S:175 @ ENTRY(vfp_support_entry)
 					@ out before setting an FPEXC that
 					@ stops us reading stuff
 	VFPFMXR	FPEXC, r1		@ Restore FPEXC last
+	mov	sp, r3			@ we think we have handled things
+	pop	{lr}
 	sub	r2, r2, #4		@ Retry current instruction - if Thumb
 	str	r2, [sp, #S_PC]		@ mode it's two 16-bit instructions,
 					@ else it's one 32-bit instruction, so
 					@ always subtract 4 from the following
 					@ instruction address.
-	local_bh_enable_ti r10, r4
-	ret	r3			@ we think we have handled things
+	b	vfp_exit		@ tail call
 
 look_for_VFP_exceptions:
 	@ Check for synchronous or asynchronous exception
@ arch/arm/vfp/vfphw.S:206 @ ENTRY(vfp_support_entry)
 					@ not recognised by VFP
 
 	DBGSTR	"not VFP"
-	local_bh_enable_ti r10, r4
-	ret	lr
+	b	vfp_exit		@ tail call
 
 process_exception:
 	DBGSTR	"bounce"
+	mov	sp, r3			@ setup for a return to the user code.
+	pop	{lr}
 	mov	r2, sp			@ nothing stacked - regdump is at TOS
-	mov	lr, r3			@ setup for a return to the user code.
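The entry-armv.S hunks above encode the lazy-preemption policy at SVC-mode IRQ exit: an ordinary NEED_RESCHED request preempts whenever the preempt count is zero, while NEED_RESCHED_LAZY additionally requires the new preempt_lazy_count to be zero. As a reading aid, here is a C restatement of that decision; it is illustrative only (the helper name is made up), the real check is the assembly above operating on TI_PREEMPT, TI_PREEMPT_LAZY and TI_FLAGS.

/*
 * Illustrative restatement of the svc_preempt decision above.
 * Not part of the patch; the fields match struct thread_info.
 */
static bool irq_exit_should_preempt(const struct thread_info *ti)
{
	if (ti->preempt_count)
		return false;			/* preemption disabled */

	if (ti->flags & _TIF_NEED_RESCHED)
		return true;			/* preempt immediately */

	if (!(ti->flags & _TIF_NEED_RESCHED_LAZY))
		return false;			/* nothing pending */

	/* A lazy request also honours the lazy preempt counter. */
	return ti->preempt_lazy_count == 0;
}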
@ Now call the C code to package up the bounce to the support code @ r0 holds the trigger instruction @ arch/arm/vfp/vfpmodule.c:35 @ /* * Our undef handlers (in entry.S) */ -asmlinkage void vfp_support_entry(void); -asmlinkage void vfp_null_entry(void); +asmlinkage void vfp_support_entry(u32, void *, u32, u32); -asmlinkage void (*vfp_vector)(void) = vfp_null_entry; +static bool have_vfp __ro_after_init; /* * Dual-use variable. @ arch/arm/vfp/vfpmodule.c:57 @ static unsigned int __initdata VFP_arch; */ union vfp_state *vfp_current_hw_state[NR_CPUS]; +/* + * Claim ownership of the VFP unit. + * + * The caller may change VFP registers until vfp_unlock() is called. + * + * local_bh_disable() is used to disable preemption and to disable VFP + * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is + * not sufficient because it only serializes soft interrupt related sections + * via a local lock, but stays preemptible. Disabling preemption is the right + * choice here as bottom half processing is always in thread context on RT + * kernels so it implicitly prevents bottom half processing as well. + */ +static void vfp_lock(void) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); +} + +static void vfp_unlock(void) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); +} + /* * Is 'thread's most up to date state stored in this CPUs hardware? * Must be called from non-preemptible context. @ arch/arm/vfp/vfpmodule.c:270 @ static void vfp_panic(char *reason, u32 inst) /* * Process bitmask of exception conditions. */ -static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) +static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) { int si_code = 0; @ arch/arm/vfp/vfpmodule.c:278 @ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ if (exceptions == VFP_EXCEPTION_ERROR) { vfp_panic("unhandled bounce", inst); - vfp_raise_sigfpe(FPE_FLTINV, regs); - return; + return FPE_FLTINV; } /* @ arch/arm/vfp/vfpmodule.c:306 @ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); - if (si_code) - vfp_raise_sigfpe(si_code, regs); + return si_code; } /* @ arch/arm/vfp/vfpmodule.c:351 @ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) { u32 fpscr, orig_fpscr, fpsid, exceptions; + int si_code2 = 0; + int si_code = 0; pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); @ arch/arm/vfp/vfpmodule.c:400 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) * unallocated VFP instruction but with FPSCR.IXE set and not * on VFP subarch 1. */ - vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); + si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); goto exit; } @ arch/arm/vfp/vfpmodule.c:425 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) */ exceptions = vfp_emulate_instruction(trigger, fpscr, regs); if (exceptions) - vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); + si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); /* * If there isn't a second FP instruction, exit now. 
Note that @ arch/arm/vfp/vfpmodule.c:444 @ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) emulate: exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); if (exceptions) - vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); + si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); + exit: - local_bh_enable(); + vfp_unlock(); + if (si_code2) + vfp_raise_sigfpe(si_code2, regs); + if (si_code) + vfp_raise_sigfpe(si_code, regs); } static void vfp_enable(void *unused) @ arch/arm/vfp/vfpmodule.c:550 @ static inline void vfp_pm_init(void) { } */ void vfp_sync_hwstate(struct thread_info *thread) { - unsigned int cpu = get_cpu(); + vfp_lock(); - local_bh_disable(); - - if (vfp_state_in_hw(cpu, thread)) { + if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { u32 fpexc = fmrx(FPEXC); /* @ arch/arm/vfp/vfpmodule.c:563 @ void vfp_sync_hwstate(struct thread_info *thread) fmxr(FPEXC, fpexc); } - local_bh_enable(); - put_cpu(); + vfp_unlock(); } /* Ensure that the thread reloads the hardware VFP state on the next use. */ @ arch/arm/vfp/vfpmodule.c:677 @ static int vfp_starting_cpu(unsigned int unused) return 0; } +/* + * Entered with: + * + * r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) + * r1 = thread_info pointer + * r2 = PC value to resume execution after successful emulation + * r3 = normal "successful" return address + * lr = unrecognised instruction return address + */ +asmlinkage void vfp_entry(u32 trigger, struct thread_info *ti, u32 resume_pc, + u32 resume_return_address) +{ + if (unlikely(!have_vfp)) + return; + + vfp_lock(); + vfp_support_entry(trigger, ti, resume_pc, resume_return_address); +} + +asmlinkage void vfp_exit(void) +{ + vfp_unlock(); +} + #ifdef CONFIG_KERNEL_MODE_NEON static int vfp_kmode_exception(struct pt_regs *regs, unsigned int instr) @ arch/arm/vfp/vfpmodule.c:776 @ void kernel_neon_begin(void) unsigned int cpu; u32 fpexc; - local_bh_disable(); + vfp_lock(); /* * Kernel mode NEON is only allowed outside of hardirq context with @ arch/arm/vfp/vfpmodule.c:807 @ void kernel_neon_end(void) { /* Disable the NEON/VFP unit. */ fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); - local_bh_enable(); + vfp_unlock(); } EXPORT_SYMBOL(kernel_neon_end); @ arch/arm/vfp/vfpmodule.c:854 @ static int __init vfp_init(void) vfpsid = fmrx(FPSID); barrier(); unregister_undef_hook(&vfp_detect_hook); - vfp_vector = vfp_null_entry; pr_info("VFP support v0.3: "); if (VFP_arch) { @ arch/arm/vfp/vfpmodule.c:938 @ static int __init vfp_init(void) "arm/vfp:starting", vfp_starting_cpu, vfp_dying_cpu); - vfp_vector = vfp_support_entry; + have_vfp = true; thread_register_notifier(&vfp_notifier_block); vfp_pm_init(); @ arch/arm64/Kconfig:98 @ config ARM64 select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK + select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @ arch/arm64/Kconfig:211 @ config ARM64 select HAVE_PERF_USER_STACK_DUMP select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PREEMPT_LAZY select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_FUNCTION_ARG_ACCESS_API select MMU_GATHER_RCU_TABLE_FREE @ arch/arm64/include/asm/preempt.h:74 @ static inline bool __preempt_count_dec_and_test(void) * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. 
*/ - return !pc || !READ_ONCE(ti->preempt_count); + if (!pc || !READ_ONCE(ti->preempt_count)) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if ((pc & ~PREEMPT_NEED_RESCHED)) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif } static inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + if (pc == preempt_offset) + return true; + + if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) + return false; + + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; +#endif } #ifdef CONFIG_PREEMPTION @ arch/arm64/include/asm/thread_info.h:29 @ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { @ arch/arm64/include/asm/thread_info.h:72 @ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @ arch/arm64/include/asm/thread_info.h:105 @ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) @ arch/arm64/include/asm/thread_info.h:117 @ int arch_dup_task_struct(struct task_struct *dst, _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ @ arch/arm64/kernel/asm-offsets.c:36 @ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif @ arch/arm64/kernel/signal.c:1281 @ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @ arch/powerpc/Kconfig:157 @ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP 
select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @ arch/powerpc/Kconfig:251 @ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_RSEQ select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK @ arch/powerpc/include/asm/stackprotector.h:22 @ */ static __always_inline void boot_init_stack_canary(void) { - unsigned long canary = get_random_canary(); + unsigned long canary; +#ifndef CONFIG_PREEMPT_RT + canary = get_random_canary(); +#else + canary = ((unsigned long)&canary) & CANARY_MASK; +#endif current->stack_canary = canary; #ifdef CONFIG_PPC64 get_paca()->canary = canary; @ arch/powerpc/include/asm/thread_info.h:56 @ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ #ifdef CONFIG_SMP unsigned int cpu; #endif @ arch/powerpc/include/asm/thread_info.h:82 @ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .preempt_count = INIT_PREEMPT_COUNT, \ + .preempt_lazy_count = 0, \ .flags = 0, \ } @ arch/powerpc/include/asm/thread_info.h:108 @ void arch_setup_new_exec(void); #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @ arch/powerpc/include/asm/thread_info.h:124 @ void arch_setup_new_exec(void); #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) @ arch/powerpc/include/asm/thread_info.h:136 @ void arch_setup_new_exec(void); #define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_RESTOREALL (1<<TIF_RESTOREALL) #define _TIF_NOERROR (1<<TIF_NOERROR) @ arch/powerpc/include/asm/thread_info.h:150 @ void arch_setup_new_exec(void); _TIF_SYSCALL_EMU) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ _TIF_NOTIFY_SIGNAL) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ @ arch/powerpc/kernel/interrupt.c:189 @ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) ti_flags = read_thread_flags(); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { /* @ arch/powerpc/kernel/interrupt.c:401 @ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: - if (IS_ENABLED(CONFIG_PREEMPT)) { + if (IS_ENABLED(CONFIG_PREEMPTION)) { /* Return to preemptible kernel context */ if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) { if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { + if ((preempt_count() == 0) && + (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } @ arch/powerpc/kernel/traps.c:264 @ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", @ arch/powerpc/kvm/Kconfig:228 @ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && PPC_E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING @ arch/powerpc/perf/imc-pmu.c:54 @ static int trace_imc_mem_size; * core and trace-imc */ static struct imc_pmu_ref imc_global_refc = { - .lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock), + .lock = __SPIN_LOCK_UNLOCKED(imc_global_refc.lock), .id = 0, .refc = 0, }; @ arch/powerpc/platforms/pseries/Kconfig:5 @ config PPC_PSERIES depends on PPC64 && PPC_BOOK3S bool "IBM pSeries & new (POWER5-based) iSeries" + select GENERIC_ALLOCATOR select HAVE_PCSPKR_PLATFORM select MPIC select OF_DYNAMIC @ arch/powerpc/platforms/pseries/iommu.c:27 @ #include <linux/of.h> #include <linux/iommu.h> #include <linux/rculist.h> +#include <linux/local_lock.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @ arch/powerpc/platforms/pseries/iommu.c:199 @ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @ arch/powerpc/platforms/pseries/iommu.c:228 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @ arch/powerpc/platforms/pseries/iommu.c:240 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tceshift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> tceshift; @ 
arch/powerpc/platforms/pseries/iommu.c:275 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @ arch/powerpc/platforms/pseries/iommu.c:451 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @ arch/powerpc/platforms/pseries/iommu.c:504 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return rc; } @ arch/x86/Kconfig:117 @ config X86 select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @ arch/x86/Kconfig:256 @ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK @ arch/x86/include/asm/preempt.h:93 @ static __always_inline void __preempt_count_sub(int val) * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, __percpu_arg([var])); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if (preempt_count()) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u32 tmp; + tmp = raw_cpu_read_4(pcpu_hot.preempt_count); + if (tmp == preempt_offset) + return true; + + /* preempt count == 0 ? 
*/ + tmp &= ~PREEMPT_NEED_RESCHED; + if (tmp != preempt_offset) + return false; + /* XXX PREEMPT_LOCK_OFFSET */ + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); +#endif } #ifdef CONFIG_PREEMPTION @ arch/x86/include/asm/thread_info.h:60 @ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ #ifdef CONFIG_SMP u32 cpu; /* current CPU */ #endif @ arch/x86/include/asm/thread_info.h:70 @ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ + .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ @ arch/x86/include/asm/thread_info.h:98 @ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @ arch/x86/include/asm/thread_info.h:122 @ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) @ drivers/block/zram/zram_drv.c:60 @ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +#ifdef CONFIG_PREEMPT_RT +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + +static int zram_slot_trylock(struct zram *zram, u32 index) +{ + int ret; + + ret = spin_trylock(&zram->table[index].lock); + if (ret) + __set_bit(ZRAM_LOCK, &zram->table[index].flags); + return ret; +} + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + spin_lock(&zram->table[index].lock); + __set_bit(ZRAM_LOCK, &zram->table[index].flags); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + __clear_bit(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); +} + +#else + +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } static int zram_slot_trylock(struct zram *zram, u32 index) { @ drivers/block/zram/zram_drv.c:109 @ static void zram_slot_unlock(struct zram *zram, u32 index) { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } +#endif static inline bool init_done(struct zram *zram) { @ drivers/block/zram/zram_drv.c:1349 @ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zram_meta_init_table_locks(zram, num_pages); return true; } @ drivers/block/zram/zram_drv.h:72 @ struct zram_table_entry { unsigned long element; }; unsigned long flags; +#ifdef CONFIG_PREEMPT_RT + spinlock_t lock; +#endif #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif @ drivers/char/tpm/tpm_tis.c:53 @ 
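The pseries IOMMU hunks earlier in this diff replace the local_irq_save()/local_irq_disable() protection of the per-CPU TCE bounce page with a local_lock_t embedded in the per-CPU data: on !PREEMPT_RT the lock still disables interrupts, on PREEMPT_RT it becomes a per-CPU lock so the section stays preemptible. A condensed sketch of that conversion pattern, with made-up names (pcpu_buf, use_pcpu_buf) rather than the patch's tce_page code:

#include <linux/gfp.h>
#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcpu_buf {
	void *page;
	local_lock_t lock;
};

static DEFINE_PER_CPU(struct pcpu_buf, pcpu_buf) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void use_pcpu_buf(void)
{
	unsigned long flags;
	void *p;

	/* Serializes against this CPU only; preemptible on PREEMPT_RT. */
	local_lock_irqsave(&pcpu_buf.lock, flags);
	p = __this_cpu_read(pcpu_buf.page);
	if (!p) {
		p = (void *)__get_free_page(GFP_ATOMIC);
		if (!p) {
			local_unlock_irqrestore(&pcpu_buf.lock, flags);
			return;
		}
		__this_cpu_write(pcpu_buf.page, p);
	}
	/* ... use the per-CPU page ... */
	local_unlock_irqrestore(&pcpu_buf.lock, flags);
}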
static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } +#ifdef CONFIG_PREEMPT_RT +/* + * Flush previous write operations with a dummy read operation to the + * TPM MMIO base address. + */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +/* + * Write a byte word to the TPM MMIO address, and flush the write queue. + * The flush ensures that the data is sent immediately over the bus and not + * aggregated with further requests and transferred later in a batch. The large + * write requests can lead to unwanted latency spikes by blocking the CPU until + * the complete batch has been transferred. + */ +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +/* + * Write a 32-bit word to the TPM MMIO address, and flush the write queue. + * The flush ensures that the data is sent immediately over the bus and not + * aggregated with further requests and transferred later in a batch. The large + * write requests can lead to unwanted latency spikes by blocking the CPU until + * the complete batch has been transferred. + */ +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @ drivers/char/tpm/tpm_tis.c:228 @ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); break; case TPM_TIS_PHYS_16: return -EINVAL; case TPM_TIS_PHYS_32: - iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase + addr); + tpm_tis_iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase, addr); break; } @ drivers/gpu/drm/i915/Kconfig:6 @ config DRM_I915 tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" depends on DRM depends on X86 && PCI - depends on !PREEMPT_RT select INTEL_GTT if X86 select INTERVAL_TREE # we need shmfs for the swappable backing store, and in particular @ drivers/gpu/drm/i915/display/intel_crtc.c:523 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) */ intel_psr_wait_for_idle_locked(new_crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; @ drivers/gpu/drm/i915/display/intel_crtc.c:549 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); @ drivers/gpu/drm/i915/display/intel_crtc.c:588 @ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) @ drivers/gpu/drm/i915/display/intel_crtc.c:698 @ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) if (new_crtc_state->seamless_m_n && intel_crtc_needs_fastset(new_crtc_state)) intel_crtc_update_active_timings(new_crtc_state); - local_irq_enable(); + if 
(!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); if (intel_vgpu_active(dev_priv)) return; @ drivers/gpu/drm/i915/display/intel_vblank.c:296 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Get optional system timestamp before query. */ if (stime) @ drivers/gpu/drm/i915/display/intel_vblank.c:362 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, if (etime) *etime = ktime_get(); - /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); @ drivers/gpu/drm/i915/gt/intel_breadcrumbs.c:315 @ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); + irq_work_queue(&b->irq_work); cond_resched(); + irq_work_sync(&b->irq_work); } } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1306 @ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ - spin_lock(&sched_engine->lock); + spin_lock_irq(&sched_engine->lock); /* * If the queue is higher priority than the last @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1406 @ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); return; } } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1432 @ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); - spin_unlock(&engine->sched_engine->lock); + spin_unlock_irq(&engine->sched_engine->lock); return; /* leave this for another sibling */ } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1594 @ static void execlists_dequeue(struct intel_engine_cs *engine) */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); /* * We can skip poking the HW if we ended up with exactly the same set @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1620 @ static void execlists_dequeue(struct intel_engine_cs *engine) } } -static void execlists_dequeue_irq(struct intel_engine_cs *engine) -{ - local_irq_disable(); /* Suspend interrupts across request submission */ - execlists_dequeue(engine); - local_irq_enable(); /* flush irq_work (e.g. 
breadcrumb enabling) */ -} - static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2473 @ static void execlists_submission_tasklet(struct tasklet_struct *t) } if (!engine->execlists.pending[0]) { - execlists_dequeue_irq(engine); + execlists_dequeue(engine); start_timeslice(engine); } @ drivers/gpu/drm/i915/i915_request.c:616 @ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @ drivers/gpu/drm/i915/i915_request.c:724 @ void __i915_request_unsubmit(struct i915_request *request) */ RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @ drivers/gpu/drm/i915/i915_trace.h:9 @ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#ifdef CONFIG_PREEMPT_RT +#define NOTRACE +#endif + #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> @ drivers/gpu/drm/i915/i915_trace.h:329 @ DEFINE_EVENT(i915_request, i915_request_add, TP_ARGS(rq) ); -#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) +#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) @ drivers/gpu/drm/i915/i915_utils.h:291 @ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ -#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) +#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) @ drivers/tty/serial/8250/8250.h:180 @ static inline void serial_dl_write(struct uart_8250_port *up, int value) up->dl_write(up, value); } +static inline bool serial8250_is_console(struct uart_port *port) +{ + return uart_console(port) && !hlist_unhashed_lockless(&port->cons->node); +} + +/** + * serial8250_init_wctxt - Initialize a write context for + * non-console-printing usage + * @wctxt: The write context to initialize + * @cons: The console to assign to the write context + * + * In order to mark an unsafe region, drivers must acquire the console. This + * requires providing an initialized write context (even if that driver will + * not be doing any printing). + * + * This function should not be used for console printing contexts. + */ +static inline void serial8250_init_wctxt(struct cons_write_context *wctxt, + struct console *cons) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + memset(wctxt, 0, sizeof(*wctxt)); + ctxt->console = cons; + ctxt->prio = CONS_PRIO_NORMAL; +} + +/** + * __serial8250_console_acquire - Acquire a console for + * non-console-printing usage + * @wctxt: An uninitialized write context to use for acquiring + * @cons: The console to assign to the write context + * + * The caller is holding the port->lock. + * The caller is holding the console_srcu_read_lock. + * + * This function should not be used for console printing contexts. 
+ */ +static inline void __serial8250_console_acquire(struct cons_write_context *wctxt, + struct console *cons) +{ + for (;;) { + serial8250_init_wctxt(wctxt, cons); + if (console_try_acquire(wctxt)) + break; + cpu_relax(); + } +} + +/** + * serial8250_enter_unsafe - Mark the beginning of an unsafe region for + * non-console-printing usage + * @up: The port that is entering the unsafe state + * + * The caller should ensure @up is a console before calling this function. + * + * The caller is holding the port->lock. + * This function takes the console_srcu_read_lock and becomes owner of the + * console associated with @up. + * + * This function should not be used for console printing contexts. + */ +static inline void serial8250_enter_unsafe(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + + lockdep_assert_held_once(&port->lock); + + for (;;) { + up->cookie = console_srcu_read_lock(); + + __serial8250_console_acquire(&up->wctxt, port->cons); + + if (console_enter_unsafe(&up->wctxt)) + break; + + console_srcu_read_unlock(up->cookie); + cpu_relax(); + } +} + +/** + * serial8250_exit_unsafe - Mark the end of an unsafe region for + * non-console-printing usage + * @up: The port that is exiting the unsafe state + * + * The caller is holding the port->lock. + * This function releases ownership of the console associated with @up and + * releases the console_srcu_read_lock. + * + * This function should not be used for console printing contexts. + */ +static inline void serial8250_exit_unsafe(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + + lockdep_assert_held_once(&port->lock); + + if (console_exit_unsafe(&up->wctxt)) + console_release(&up->wctxt); + + console_srcu_read_unlock(up->cookie); +} + +/** + * serial8250_in_IER - Read the IER register for + * non-console-printing usage + * @up: The port to work on + * + * Returns: The value read from IER + * + * The caller is holding the port->lock. + * + * This is the top-level function for non-console-printing contexts to + * read the IER register. The caller does not need to care if @up is a + * console before calling this function. + * + * This function should not be used for printing contexts. + */ +static inline int serial8250_in_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + bool is_console; + int ier; + + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + + ier = serial_in(up, UART_IER); + + if (is_console) + serial8250_exit_unsafe(up); + + return ier; +} + +/** + * __serial8250_set_IER - Directly write to the IER register + * @up: The port to work on + * @wctxt: The current write context + * @ier: The value to write + * + * Returns: True if IER was written to. False otherwise + * + * The caller is holding the port->lock. + * The caller is holding the console_srcu_read_unlock. + * The caller is the owner of the console associated with @up. + * + * This function should only be directly called within console printing + * contexts. Other contexts should use serial8250_set_IER(). + */ +static inline bool __serial8250_set_IER(struct uart_8250_port *up, + struct cons_write_context *wctxt, + int ier) +{ + if (wctxt && !console_can_proceed(wctxt)) + return false; + serial_out(up, UART_IER, ier); + return true; +} + +/** + * serial8250_set_IER - Write a new value to the IER register for + * non-console-printing usage + * @up: The port to work on + * @ier: The value to write + * + * The caller is holding the port->lock. 
+ * + * This is the top-level function for non-console-printing contexts to + * write to the IER register. The caller does not need to care if @up is a + * console before calling this function. + * + * This function should not be used for printing contexts. + */ +static inline void serial8250_set_IER(struct uart_8250_port *up, int ier) +{ + struct uart_port *port = &up->port; + bool is_console; + + is_console = serial8250_is_console(port); + + if (is_console) { + serial8250_enter_unsafe(up); + while (!__serial8250_set_IER(up, &up->wctxt, ier)) { + console_srcu_read_unlock(up->cookie); + console_enter_unsafe(&up->wctxt); + } + serial8250_exit_unsafe(up); + } else { + __serial8250_set_IER(up, NULL, ier); + } +} + +/** + * __serial8250_clear_IER - Directly clear the IER register + * @up: The port to work on + * @wctxt: The current write context + * @prior: Gets set to the previous value of IER + * + * Returns: True if IER was cleared and @prior points to the previous + * value of IER. False otherwise and @prior is invalid + * + * The caller is holding the port->lock. + * The caller is holding the console_srcu_read_unlock. + * The caller is the owner of the console associated with @up. + * + * This function should only be directly called within console printing + * contexts. Other contexts should use serial8250_clear_IER(). + */ +static inline bool __serial8250_clear_IER(struct uart_8250_port *up, + struct cons_write_context *wctxt, + int *prior) +{ + unsigned int clearval = 0; + + if (up->capabilities & UART_CAP_UUE) + clearval = UART_IER_UUE; + + *prior = serial_in(up, UART_IER); + if (wctxt && !console_can_proceed(wctxt)) + return false; + serial_out(up, UART_IER, clearval); + return true; +} + +/** + * serial8250_clear_IER - Clear the IER register for + * non-console-printing usage + * @up: The port to work on + * + * Returns: The previous value of IER + * + * The caller is holding the port->lock. + * + * This is the top-level function for non-console-printing contexts to + * clear the IER register. The caller does not need to care if @up is a + * console before calling this function. + * + * This function should not be used for printing contexts. 
+ */ +static inline int serial8250_clear_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + bool is_console; + int prior; + + is_console = serial8250_is_console(port); + + if (is_console) { + serial8250_enter_unsafe(up); + while (!__serial8250_clear_IER(up, &up->wctxt, &prior)) { + console_srcu_read_unlock(up->cookie); + console_enter_unsafe(&up->wctxt); + } + serial8250_exit_unsafe(up); + } else { + __serial8250_clear_IER(up, NULL, &prior); + } + + return prior; +} + static inline bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @ drivers/tty/serial/8250/8250.h:459 @ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @ drivers/tty/serial/8250/8250_aspeed_vuart.c:281 @ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up, up->ier &= ~irqs; if (!throttle) up->ier |= irqs; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) { @ drivers/tty/serial/8250/8250_bcm7271.c:609 @ static int brcmuart_startup(struct uart_port *port) * Disable the Receive Data Interrupt because the DMA engine * will handle this. */ + spin_lock_irq(&port->lock); up->ier &= ~UART_IER_RDI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); + spin_unlock_irq(&port->lock); priv->tx_running = false; priv->dma.rx_dma = NULL; @ drivers/tty/serial/8250/8250_bcm7271.c:792 @ static int brcmuart_handle_irq(struct uart_port *p) spin_lock_irqsave(&p->lock, flags); status = serial_port_in(p, UART_LSR); if ((status & UART_LSR_DR) == 0) { + bool is_console; + + is_console = serial8250_is_console(p); + + if (is_console) + serial8250_enter_unsafe(up); ier = serial_port_in(p, UART_IER); /* @ drivers/tty/serial/8250/8250_bcm7271.c:818 @ static int brcmuart_handle_irq(struct uart_port *p) serial_port_in(p, UART_RX); } + if (is_console) + serial8250_exit_unsafe(up); + handled = 1; } spin_unlock_irqrestore(&p->lock, flags); @ drivers/tty/serial/8250/8250_bcm7271.c:858 @ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) /* re-enable receive unless upper layer has disabled it */ if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == (UART_IER_RLSI | UART_IER_RDI)) { + bool is_console; + + is_console = serial8250_is_console(p); + + if (is_console) + serial8250_enter_unsafe(up); + status = serial_port_in(p, UART_IER); status |= (UART_IER_RLSI | UART_IER_RDI); serial_port_out(p, UART_IER, status); status = serial_port_in(p, UART_MCR); status |= UART_MCR_RTS; serial_port_out(p, UART_MCR, status); + + if (is_console) + serial8250_exit_unsafe(up); } spin_unlock_irqrestore(&p->lock, flags); return HRTIMER_NORESTART; @ drivers/tty/serial/8250/8250_core.c:259 @ static void serial8250_timeout(struct timer_list *t) static void serial8250_backup_timeout(struct timer_list *t) { struct uart_8250_port *up = from_timer(up, t, timer); + struct uart_port *port = &up->port; unsigned int iir, ier = 0, lsr; unsigned long flags; @ drivers/tty/serial/8250/8250_core.c:270 @ static void serial8250_backup_timeout(struct timer_list *t) * based handler. 
*/ if (up->port.irq) { + bool is_console; + + /* + * Do not use serial8250_clear_IER() because this code + * ignores capabilties. + */ + + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + ier = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); + + if (is_console) + serial8250_exit_unsafe(up); } iir = serial_in(up, UART_IIR); @ drivers/tty/serial/8250/8250_core.c:309 @ static void serial8250_backup_timeout(struct timer_list *t) serial8250_tx_chars(up); if (up->port.irq) - serial_out(up, UART_IER, ier); + serial8250_set_IER(up, ier); spin_unlock_irqrestore(&up->port.lock, flags); @ drivers/tty/serial/8250/8250_core.c:595 @ serial8250_register_ports(struct uart_driver *drv, struct device *dev) #ifdef CONFIG_SERIAL_8250_CONSOLE -static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) +static void univ8250_console_port_lock(struct console *con, bool do_lock, unsigned long *flags) +{ + struct uart_8250_port *up = &serial8250_ports[con->index]; + + if (do_lock) + spin_lock_irqsave(&up->port.lock, *flags); + else + spin_unlock_irqrestore(&up->port.lock, *flags); +} + +static bool univ8250_console_write_atomic(struct console *co, + struct cons_write_context *wctxt) { struct uart_8250_port *up = &serial8250_ports[co->index]; - serial8250_console_write(up, s, count); + return serial8250_console_write_atomic(up, wctxt); +} + +static bool univ8250_console_write_thread(struct console *co, + struct cons_write_context *wctxt) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + return serial8250_console_write_thread(up, wctxt); } static int univ8250_console_setup(struct console *co, char *options) @ drivers/tty/serial/8250/8250_core.c:706 @ static int univ8250_console_match(struct console *co, char *name, int idx, static struct console univ8250_console = { .name = "ttyS", - .write = univ8250_console_write, + .write_atomic = univ8250_console_write_atomic, + .write_thread = univ8250_console_write_thread, + .port_lock = univ8250_console_port_lock, .device = uart_console_device, .setup = univ8250_console_setup, .exit = univ8250_console_exit, .match = univ8250_console_match, - .flags = CON_PRINTBUFFER | CON_ANYTIME, + .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NO_BKL, .index = -1, .data = &serial8250_reg, }; @ drivers/tty/serial/8250/8250_core.c:1001 @ static void serial_8250_overrun_backoff_work(struct work_struct *work) spin_lock_irqsave(&port->lock, flags); up->ier |= UART_IER_RLSI | UART_IER_RDI; up->port.read_status_mask |= UART_LSR_DR; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); spin_unlock_irqrestore(&port->lock, flags); } @ drivers/tty/serial/8250/8250_exar.c:188 @ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud, static int xr17v35x_startup(struct uart_port *port) { + struct uart_8250_port *up = up_to_u8250p(port); + + spin_lock_irq(&port->lock); + /* * First enable access to IER [7:5], ISR [5:4], FCR [5:4], * MCR [7:5] and MSR [7:0] @ drivers/tty/serial/8250/8250_exar.c:202 @ static int xr17v35x_startup(struct uart_port *port) * Make sure all interrups are masked until initialization is * complete and the FIFOs are cleared */ - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); + + spin_unlock_irq(&port->lock); return serial8250_do_startup(port); } @ drivers/tty/serial/8250/8250_fsl.c:61 @ int fsl8250_handle_irq(struct uart_port *port) if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { unsigned long delay; - up->ier = 
port->serial_in(port, UART_IER); + up->ier = serial8250_in_IER(up); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { @ drivers/tty/serial/8250/8250_mtk.c:225 @ static void mtk8250_shutdown(struct uart_port *port) static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); + struct uart_port *port = &up->port; + bool is_console; + int ier; + + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) + serial8250_exit_unsafe(up); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); + struct uart_port *port = &up->port; + bool is_console; + int ier; + + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (is_console) + serial8250_exit_unsafe(up); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) @ drivers/tty/serial/8250/8250_omap.c:337 @ static void omap8250_restore_regs(struct uart_8250_port *up) /* drop TCR + TLR access, we setup XON/XOFF later */ serial8250_out_MCR(up, mcr); - - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_dl_write(up, priv->quot); @ drivers/tty/serial/8250/8250_omap.c:525 @ static void omap_8250_pm(struct uart_port *port, unsigned int state, u8 efr; pm_runtime_get_sync(port->dev); + + spin_lock_irq(&port->lock); + serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); efr = serial_in(up, UART_EFR); serial_out(up, UART_EFR, efr | UART_EFR_ECB); serial_out(up, UART_LCR, 0); - serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0); + serial8250_set_IER(up, (state != 0) ? 
UART_IERX_SLEEP : 0); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, efr); serial_out(up, UART_LCR, 0); + spin_unlock_irq(&port->lock); + pm_runtime_mark_last_busy(port->dev); pm_runtime_put_autosuspend(port->dev); } @ drivers/tty/serial/8250/8250_omap.c:656 @ static irqreturn_t omap8250_irq(int irq, void *dev_id) if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { unsigned long delay; - up->ier = port->serial_in(port, UART_IER); + spin_lock(&port->lock); + up->ier = serial8250_in_IER(up); if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { @ drivers/tty/serial/8250/8250_omap.c:666 @ static irqreturn_t omap8250_irq(int irq, void *dev_id) */ cancel_delayed_work(&up->overrun_backoff); } + spin_unlock(&port->lock); delay = msecs_to_jiffies(up->overrun_backoff_time_ms); schedule_delayed_work(&up->overrun_backoff, delay); @ drivers/tty/serial/8250/8250_omap.c:716 @ static int omap_8250_startup(struct uart_port *port) if (ret < 0) goto err; + spin_lock_irq(&port->lock); up->ier = UART_IER_RLSI | UART_IER_RDI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); + spin_unlock_irq(&port->lock); #ifdef CONFIG_PM up->capabilities |= UART_CAP_RPM; @ drivers/tty/serial/8250/8250_omap.c:759 @ static void omap_8250_shutdown(struct uart_port *port) if (priv->habit & UART_HAS_EFR2) serial_out(up, UART_OMAP_EFR2, 0x0); + spin_lock_irq(&port->lock); up->ier = 0; - serial_out(up, UART_IER, 0); + serial8250_set_IER(up, 0); + spin_unlock_irq(&port->lock); if (up->dma) serial8250_release_dma(up); @ drivers/tty/serial/8250/8250_omap.c:810 @ static void omap_8250_unthrottle(struct uart_port *port) up->dma->rx_dma(up); up->ier |= UART_IER_RLSI | UART_IER_RDI; port->read_status_mask |= UART_LSR_DR; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); spin_unlock_irqrestore(&port->lock, flags); pm_runtime_mark_last_busy(port->dev); @ drivers/tty/serial/8250/8250_omap.c:969 @ static void __dma_rx_complete(void *param) __dma_rx_do_complete(p); if (!priv->throttled) { p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_out(p, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); if (!(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(p); } @ drivers/tty/serial/8250/8250_omap.c:1026 @ static int omap_8250_rx_dma(struct uart_8250_port *p) * callback to run. */ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); - serial_out(p, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } goto out; } @ drivers/tty/serial/8250/8250_omap.c:1239 @ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, * periodic timeouts, re-enable interrupts. 
*/ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); omap_8250_rx_dma_flush(up); serial_in(up, UART_IIR); serial_out(up, UART_OMAP_EFR2, 0x0); up->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } } @ drivers/tty/serial/8250/8250_omap.c:1730 @ static int omap8250_runtime_resume(struct device *dev) up = serial8250_get_port(priv->line); + spin_lock_irq(&up->port.lock); + if (omap8250_lost_context(up)) omap8250_restore_regs(up); if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(up); + spin_unlock_irq(&up->port.lock); + priv->latency = priv->calc_latency; schedule_work(&priv->qos_work); return 0; @ drivers/tty/serial/8250/8250_port.c:748 @ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial8250_rpm_get(p); if (p->capabilities & UART_CAP_SLEEP) { + spin_lock_irq(&p->port.lock); if (p->capabilities & UART_CAP_EFR) { lcr = serial_in(p, UART_LCR); efr = serial_in(p, UART_EFR); @ drivers/tty/serial/8250/8250_port.c:756 @ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } - serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); + serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); serial_out(p, UART_LCR, lcr); } + spin_unlock_irq(&p->port.lock); } serial8250_rpm_put(p); } -static void serial8250_clear_IER(struct uart_8250_port *up) -{ - if (up->capabilities & UART_CAP_UUE) - serial_out(up, UART_IER, UART_IER_UUE); - else - serial_out(up, UART_IER, 0); -} - #ifdef CONFIG_SERIAL_8250_RSA /* * Attempts to turn on the RSA FIFO. Returns zero on failure. @ drivers/tty/serial/8250/8250_port.c:1031 @ static int broken_efr(struct uart_8250_port *up) */ static void autoconfig_16550a(struct uart_8250_port *up) { + struct uart_port *port = &up->port; unsigned char status1, status2; unsigned int iersave; + bool is_console; up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; @ drivers/tty/serial/8250/8250_port.c:1150 @ static void autoconfig_16550a(struct uart_8250_port *up) return; } + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's @ drivers/tty/serial/8250/8250_port.c:1190 @ static void autoconfig_16550a(struct uart_8250_port *up) } serial_out(up, UART_IER, iersave); + if (is_console) + serial8250_exit_unsafe(up); + /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. @ drivers/tty/serial/8250/8250_port.c:1234 @ static void autoconfig(struct uart_8250_port *up) up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { + bool is_console; + + is_console = serial8250_is_console(port); + + if (is_console) + serial8250_enter_unsafe(up); + /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. 
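The hunks above all give probe-time UART_IER accesses the same shape: when the port is a registered console, the raw read-modify-write of UART_IER is bracketed by serial8250_enter_unsafe()/serial8250_exit_unsafe() so that an atomic console writer cannot take the port over in the middle of the update. Below is a minimal sketch of that bracketing pattern, assuming the serial8250_is_console(), serial8250_enter_unsafe() and serial8250_exit_unsafe() helpers this series relies on; rmw_ier_example() is a hypothetical name used only for illustration and is not part of the patch, and serial_in()/serial_out() are the driver's existing register accessors from 8250.h.

static void rmw_ier_example(struct uart_8250_port *up, int set, int clear)
{
	bool is_console = serial8250_is_console(&up->port);
	int ier;

	/* Mark the port unsafe so a console takeover cannot interleave
	 * with the read-modify-write of UART_IER. */
	if (is_console)
		serial8250_enter_unsafe(up);

	ier = serial_in(up, UART_IER);
	serial_out(up, UART_IER, (ier & ~clear) | set);

	if (is_console)
		serial8250_exit_unsafe(up);
}

Non-console ports skip the unsafe-section bookkeeping entirely, so the common probe path stays as cheap as before.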
@ drivers/tty/serial/8250/8250_port.c:1270 @ static void autoconfig(struct uart_8250_port *up) #endif scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, scratch); + + if (is_console) + serial8250_exit_unsafe(up); + if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) { /* * We failed; there's nothing here @ drivers/tty/serial/8250/8250_port.c:1395 @ static void autoconfig_irq(struct uart_8250_port *up) unsigned char save_ICP = 0; unsigned int ICP = 0; unsigned long irqs; + bool is_console; int irq; if (port->flags & UPF_FOURPORT) { @ drivers/tty/serial/8250/8250_port.c:1405 @ static void autoconfig_irq(struct uart_8250_port *up) inb_p(ICP); } - if (uart_console(port)) + is_console = serial8250_is_console(port); + + if (is_console) { console_lock(); + serial8250_enter_unsafe(up); + } /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); @ drivers/tty/serial/8250/8250_port.c:1442 @ static void autoconfig_irq(struct uart_8250_port *up) if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); - if (uart_console(port)) + if (is_console) { + serial8250_exit_unsafe(up); console_unlock(); + } port->irq = (irq > 0) ? irq : 0; } @ drivers/tty/serial/8250/8250_port.c:1458 @ static void serial8250_stop_rx(struct uart_port *port) up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:1488 @ void serial8250_em485_stop_tx(struct uart_8250_port *p) serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_port_out(&p->port, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); @ drivers/tty/serial/8250/8250_port.c:1735 @ static void serial8250_disable_ms(struct uart_port *port) mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void serial8250_enable_ms(struct uart_port *port) @ drivers/tty/serial/8250/8250_port.c:1751 @ static void serial8250_enable_ms(struct uart_port *port) up->ier |= UART_IER_MSI; serial8250_rpm_get(up); - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:2201 @ static void serial8250_put_poll_char(struct uart_port *port, serial8250_rpm_get(up); /* * First save the IER then disable the interrupts + * + * Best-effort IER access because other CPUs are quiesced. 
*/ - ier = serial_port_in(port, UART_IER); - serial8250_clear_IER(up); + __serial8250_clear_IER(up, NULL, &ier); wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* @ drivers/tty/serial/8250/8250_port.c:2217 @ static void serial8250_put_poll_char(struct uart_port *port, * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); - serial_port_out(port, UART_IER, ier); + __serial8250_set_IER(up, NULL, ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:2228 @ int serial8250_do_startup(struct uart_port *port) struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; unsigned char iir; + bool is_console; int retval; u16 lsr; @ drivers/tty/serial/8250/8250_port.c:2246 @ int serial8250_do_startup(struct uart_port *port) serial8250_rpm_get(up); if (port->type == PORT_16C950) { /* Wake up and initialize UART */ + spin_lock_irqsave(&port->lock, flags); up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_LCR, 0); + spin_unlock_irqrestore(&port->lock, flags); } if (port->type == PORT_DA830) { /* Reset the port */ - serial_port_out(port, UART_IER, 0); + spin_lock_irqsave(&port->lock, flags); + serial8250_set_IER(up, 0); serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); + spin_unlock_irqrestore(&port->lock, flags); mdelay(10); /* Enable Tx, Rx and free run mode */ @ drivers/tty/serial/8250/8250_port.c:2362 @ int serial8250_do_startup(struct uart_port *port) if (retval) goto out; + is_console = serial8250_is_console(port); + if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; @ drivers/tty/serial/8250/8250_port.c:2380 @ int serial8250_do_startup(struct uart_port *port) */ spin_lock_irqsave(&port->lock, flags); + if (is_console) + serial8250_enter_unsafe(up); + wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ @ drivers/tty/serial/8250/8250_port.c:2393 @ int serial8250_do_startup(struct uart_port *port) iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); + if (is_console) + serial8250_exit_unsafe(up); + spin_unlock_irqrestore(&port->lock, flags); if (port->irqflags & IRQF_SHARED) @ drivers/tty/serial/8250/8250_port.c:2450 @ int serial8250_do_startup(struct uart_port *port) * Do a quick test to see if we receive an interrupt when we enable * the TX irq. 
*/ + if (is_console) + serial8250_enter_unsafe(up); serial_port_out(port, UART_IER, UART_IER_THRI); lsr = serial_port_in(port, UART_LSR); iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); + if (is_console) + serial8250_exit_unsafe(up); if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { @ drivers/tty/serial/8250/8250_port.c:2489 @ int serial8250_do_startup(struct uart_port *port) if (up->dma) { const char *msg = NULL; - if (uart_console(port)) + if (is_console) msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; @ drivers/tty/serial/8250/8250_port.c:2540 @ void serial8250_do_shutdown(struct uart_port *port) */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); @ drivers/tty/serial/8250/8250_port.c:2906 @ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; @ drivers/tty/serial/8250/8250_port.c:3371 @ EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE -static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) +static bool serial8250_console_putchar(struct uart_port *port, unsigned char ch, + struct cons_write_context *wctxt) { struct uart_8250_port *up = up_to_u8250p(port); wait_for_xmitr(up, UART_LSR_THRE); + if (!console_can_proceed(wctxt)) + return false; serial_port_out(port, UART_TX, ch); + if (ch == '\n') + up->console_newline_needed = false; + else + up->console_newline_needed = true; + + return true; } /* @ drivers/tty/serial/8250/8250_port.c:3414 @ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } -/* - * Print a string to the serial port using the device FIFO - * - * It sends fifosize bytes and then waits for the fifo - * to get empty. 
- */ -static void serial8250_console_fifo_write(struct uart_8250_port *up, - const char *s, unsigned int count) +static bool __serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt, + const char *s, unsigned int count, + bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *)) { - int i; - const char *end = s + count; - unsigned int fifosize = up->tx_loadsz; - bool cr_sent = false; + bool finished = false; + unsigned int i; - while (s != end) { - wait_for_lsr(up, UART_LSR_THRE); - - for (i = 0; i < fifosize && s != end; ++i) { - if (*s == '\n' && !cr_sent) { - serial_out(up, UART_TX, '\r'); - cr_sent = true; - } else { - serial_out(up, UART_TX, *s++); - cr_sent = false; - } + for (i = 0; i < count; i++, s++) { + if (*s == '\n') { + if (!putchar(port, '\r', wctxt)) + goto out; } + if (!putchar(port, *s, wctxt)) + goto out; } + finished = true; +out: + return finished; +} + +static bool serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt, + const char *s, unsigned int count, + bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *)) +{ + return __serial8250_console_write(port, wctxt, s, count, putchar); +} + +static bool atomic_print_line(struct uart_8250_port *up, + struct cons_write_context *wctxt) +{ + struct uart_port *port = &up->port; + + if (up->console_newline_needed && + !__serial8250_console_write(port, wctxt, "\n", 1, serial8250_console_putchar)) { + return false; + } + + return __serial8250_console_write(port, wctxt, wctxt->outbuf, wctxt->len, + serial8250_console_putchar); +} + +static void atomic_console_reacquire(struct cons_write_context *wctxt, + struct cons_write_context *wctxt_init) +{ + memcpy(wctxt, wctxt_init, sizeof(*wctxt)); + while (!console_try_acquire(wctxt)) { + cpu_relax(); + memcpy(wctxt, wctxt_init, sizeof(*wctxt)); + } +} + +bool serial8250_console_write_atomic(struct uart_8250_port *up, + struct cons_write_context *wctxt) +{ + struct cons_write_context wctxt_init = { }; + struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt); + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool finished = false; + unsigned int ier; + + touch_nmi_watchdog(); + + /* With write_atomic, another context may hold the port->lock. */ + + ctxt_init->console = ctxt->console; + ctxt_init->prio = ctxt->prio; + ctxt_init->thread = ctxt->thread; + + /* + * Enter unsafe in order to disable interrupts. If the console is + * lost before the interrupts are disabled, bail out because another + * context took over the printing. If the console is lost after the + * interrutps are disabled, the console must be reacquired in order + * to re-enable the interrupts. However in that case no printing is + * allowed because another context took over the printing. + */ + + if (!console_enter_unsafe(wctxt)) + return false; + + if (!__serial8250_clear_IER(up, wctxt, &ier)) + return false; + + if (!console_exit_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } + + if (!atomic_print_line(up, wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } + + wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); + finished = true; +enable_irq: + /* + * Enter unsafe in order to enable interrupts. If the console is + * lost before the interrupts are enabled, the console must be + * reacquired in order to re-enable the interrupts. 
+ */ + for (;;) { + if (console_enter_unsafe(wctxt) && + __serial8250_set_IER(up, wctxt, ier)) { + break; + } + + /* HW-IRQs still disabled. Reacquire to enable them. */ + atomic_console_reacquire(wctxt, &wctxt_init); + } + console_exit_unsafe(wctxt); + + return finished; } /* @ drivers/tty/serial/8250/8250_port.c:3538 @ static void serial8250_console_fifo_write(struct uart_8250_port *up, * Doing runtime PM is really a bad idea for the kernel console. * Thus, we assume the function is called when device is powered up. */ -void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count) +bool serial8250_console_write_thread(struct uart_8250_port *up, + struct cons_write_context *wctxt) { + struct cons_write_context wctxt_init = { }; + struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt); + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct uart_8250_em485 *em485 = up->em485; struct uart_port *port = &up->port; - unsigned long flags; - unsigned int ier, use_fifo; - int locked = 1; + unsigned int count = wctxt->len; + const char *s = wctxt->outbuf; + bool rs485_started = false; + bool finished = false; + unsigned int ier; - touch_nmi_watchdog(); - - if (oops_in_progress) - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); + ctxt_init->console = ctxt->console; + ctxt_init->prio = ctxt->prio; + ctxt_init->thread = ctxt->thread; /* - * First save the IER then disable the interrupts + * Enter unsafe in order to disable interrupts. If the console is + * lost before the interrupts are disabled, bail out because another + * context took over the printing. If the console is lost after the + * interrutps are disabled, the console must be reacquired in order + * to re-enable the interrupts. However in that case no printing is + * allowed because another context took over the printing. */ - ier = serial_port_in(port, UART_IER); - serial8250_clear_IER(up); + + if (!console_enter_unsafe(wctxt)) + return false; + + if (!__serial8250_clear_IER(up, wctxt, &ier)) + return false; + + if (!console_exit_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { + if (!console_enter_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } serial8250_console_restore(up); + if (!console_exit_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } up->canary = 0; } if (em485) { - if (em485->tx_stopped) + if (em485->tx_stopped) { + if (!console_enter_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } up->rs485_start_tx(up); - mdelay(port->rs485.delay_rts_before_send); + rs485_started = true; + if (!console_exit_unsafe(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } + } + if (port->rs485.delay_rts_before_send) { + mdelay(port->rs485.delay_rts_before_send); + if (!console_can_proceed(wctxt)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } + } } - use_fifo = (up->capabilities & UART_CAP_FIFO) && - /* - * BCM283x requires to check the fifo - * after each byte. 
- */ - !(up->capabilities & UART_CAP_MINI) && - /* - * tx_loadsz contains the transmit fifo size - */ - up->tx_loadsz > 1 && - (up->fcr & UART_FCR_ENABLE_FIFO) && - port->state && - test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && - /* - * After we put a data in the fifo, the controller will send - * it regardless of the CTS state. Therefore, only use fifo - * if we don't use control flow. - */ - !(up->port.flags & UPF_CONS_FLOW); + if (!serial8250_console_write(port, wctxt, s, count, serial8250_console_putchar)) { + atomic_console_reacquire(wctxt, &wctxt_init); + goto enable_irq; + } - if (likely(use_fifo)) - serial8250_console_fifo_write(up, s, count); - else - uart_console_write(port, s, count, serial8250_console_putchar); - - /* - * Finally, wait for transmitter to become empty - * and restore the IER - */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); - + finished = true; +enable_irq: + /* + * Enter unsafe in order to stop rs485_tx. If the console is + * lost before the rs485_tx is stopped, the console must be + * reacquired in order to stop rs485_tx. + */ if (em485) { mdelay(port->rs485.delay_rts_after_send); - if (em485->tx_stopped) + if (em485->tx_stopped && rs485_started) { + while (!console_enter_unsafe(wctxt)) + atomic_console_reacquire(wctxt, &wctxt_init); up->rs485_stop_tx(up); + if (!console_exit_unsafe(wctxt)) + atomic_console_reacquire(wctxt, &wctxt_init); + } } - serial_port_out(port, UART_IER, ier); + /* + * Enter unsafe in order to enable interrupts. If the console is + * lost before the interrupts are enabled, the console must be + * reacquired in order to re-enable the interrupts. + */ + for (;;) { + if (console_enter_unsafe(wctxt) && + __serial8250_set_IER(up, wctxt, ier)) { + break; + } + atomic_console_reacquire(wctxt, &wctxt_init); + } /* * The receive handling will happen properly because the @ drivers/tty/serial/8250/8250_port.c:3659 @ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (up->msr_saved_flags) serial8250_modem_status(up); - if (locked) - spin_unlock_irqrestore(&port->lock, flags); + console_exit_unsafe(wctxt); + + return finished; } static unsigned int probe_baud(struct uart_port *port) @ drivers/tty/serial/8250/8250_port.c:3681 @ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @ drivers/tty/serial/8250/8250_port.c:3691 @ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; + up->console_newline_needed = false; + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) @ drivers/tty/serial/8250/Kconfig:12 @ config SERIAL_8250 depends on !S390 select SERIAL_CORE select SERIAL_MCTRL_GPIO if GPIOLIB + select HAVE_ATOMIC_CONSOLE help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. People who might say N @ drivers/tty/serial/amba-pl011.c:2322 @ pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; - unsigned long flags; + unsigned long flags = 0; int locked = 1; clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. 
Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @ drivers/tty/serial/amba-pl011.c:2365 @ pl011_console_write(struct console *co, const char *s, unsigned int count) pl011_write(old_cr, uap, REG_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } @ drivers/tty/serial/omap-serial.c:1222 @ serial_omap_console_write(struct console *co, const char *s, unsigned int ier; int locked = 1; - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @ drivers/tty/serial/omap-serial.c:1252 @ serial_omap_console_write(struct console *co, const char *s, check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init @ drivers/tty/serial/serial_core.c:2339 @ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) * able to Re-start_rx later. */ if (!console_suspend_enabled && uart_console(uport)) { - if (uport->ops->start_rx) + if (uport->ops->start_rx) { + spin_lock_irq(&uport->lock); uport->ops->stop_rx(uport); + spin_unlock_irq(&uport->lock); + } goto unlock; } @ drivers/tty/serial/serial_core.c:2436 @ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) if (console_suspend_enabled) uart_change_pm(state, UART_PM_STATE_ON); uport->ops->set_termios(uport, &termios, NULL); - if (!console_suspend_enabled && uport->ops->start_rx) + if (!console_suspend_enabled && uport->ops->start_rx) { + spin_lock_irq(&uport->lock); uport->ops->start_rx(uport); + spin_unlock_irq(&uport->lock); + } if (console_suspend_enabled) console_start(uport->cons); } @ drivers/tty/tty_io.c:3546 @ static ssize_t show_cons_active(struct device *dev, for_each_console(c) { if (!c->device) continue; - if (!c->write) - continue; + if (c->flags & CON_NO_BKL) { + if (!(c->write_thread || c->write_atomic)) + continue; + } else { + if (!c->write) + continue; + } if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; @ fs/proc/consoles.c:24 @ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NO_BKL, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, }; char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; + char con_write = '-'; unsigned int a; dev_t dev = 0; @ fs/proc/consoles.c:62 @ static int show_console_dev(struct seq_file *m, void *v) seq_setwidth(m, 21 - 1); seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); - seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + if (con->flags & CON_NO_BKL) { + if (con->write_thread || con->write_atomic) + con_write = 'W'; + } else { + if (con->write) + con_write = 'W'; + } + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, + con->unblank ? 
'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); @ include/linux/console.h:19 @ #include <linux/atomic.h> #include <linux/bits.h> +#include <linux/irq_work.h> #include <linux/rculist.h> +#include <linux/rcuwait.h> #include <linux/types.h> struct vc_data; @ include/linux/console.h:159 @ static inline int con_debug_leave(void) * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. + * @CON_SUSPENDED: Indicates if a console is suspended. If true, the + * printing callbacks must not be called. + * @CON_NO_BKL: Console can operate outside of the BKL style console_lock + * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), @ include/linux/console.h:172 @ enum cons_flags { CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), + CON_SUSPENDED = BIT(7), + CON_NO_BKL = BIT(8), }; +/** + * struct cons_state - console state for NOBKL consoles + * @atom: Compound of the state fields for atomic operations + * @seq: Sequence for record tracking (64bit only) + * @bits: Compound of the state bits below + * + * @locked: Console is locked by a writer + * @unsafe: Console is busy in a non takeover region + * @thread: Current owner is the printk thread + * @cur_prio: The priority of the current output + * @req_prio: The priority of a handover request + * @cpu: The CPU on which the writer runs + * + * To be used for state read and preparation of atomic_long_cmpxchg() + * operations. + * + * The @req_prio field is particularly important to allow spin-waiting to + * timeout and give up without the risk of it being assigned the lock + * after giving up. The @req_prio field has a nice side-effect that it + * also makes it possible for a single read+cmpxchg in the common case of + * acquire and release. + */ +struct cons_state { + union { + unsigned long atom; + struct { +#ifdef CONFIG_64BIT + u32 seq; +#endif + union { + u32 bits; + struct { + u32 locked : 1; + u32 unsafe : 1; + u32 thread : 1; + u32 cur_prio : 2; + u32 req_prio : 2; + u32 cpu : 18; + }; + }; + }; + }; +}; + +/** + * cons_prio - console writer priority for NOBKL consoles + * @CONS_PRIO_NONE: Unused + * @CONS_PRIO_NORMAL: Regular printk + * @CONS_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) + * @CONS_PRIO_PANIC: Panic output + * @CONS_PRIO_MAX: The number of priority levels + * + * Emergency output can carefully takeover the console even without consent + * of the owner, ideally only when @cons_state::unsafe is not set. Panic + * output can ignore the unsafe flag as a last resort. If panic output is + * active no takeover is possible until the panic output releases the + * console. 
+ */ +enum cons_prio { + CONS_PRIO_NONE = 0, + CONS_PRIO_NORMAL, + CONS_PRIO_EMERGENCY, + CONS_PRIO_PANIC, + CONS_PRIO_MAX, +}; + +struct console; +struct printk_buffers; + +/** + * struct cons_context - Context for console acquire/release + * @console: The associated console + * @state: The state at acquire time + * @old_state: The old state when try_acquire() failed for analysis + * by the caller + * @hov_state: The handover state for spin and cleanup + * @req_state: The request state for spin and cleanup + * @spinwait_max_us: Limit for spinwait acquire + * @oldseq: The sequence number at acquire() + * @newseq: The sequence number for progress + * @prio: Priority of the context + * @pbufs: Pointer to the text buffer for this context + * @dropped: Dropped counter for the current context + * @thread: The acquire is printk thread context + * @hostile: Hostile takeover requested. Cleared on normal + * acquire or friendly handover + * @spinwait: Spinwait on acquire if possible + * @backlog: Ringbuffer has pending records + */ +struct cons_context { + struct console *console; + struct cons_state state; + struct cons_state old_state; + struct cons_state hov_state; + struct cons_state req_state; + u64 oldseq; + u64 newseq; + unsigned int spinwait_max_us; + enum cons_prio prio; + struct printk_buffers *pbufs; + unsigned long dropped; + unsigned int thread : 1; + unsigned int hostile : 1; + unsigned int spinwait : 1; + unsigned int backlog : 1; +}; + +/** + * struct cons_write_context - Context handed to the write callbacks + * @ctxt: The core console context + * @outbuf: Pointer to the text buffer for output + * @len: Length to write + * @unsafe: Invoked in unsafe state due to force takeover + */ +struct cons_write_context { + struct cons_context __private ctxt; + char *outbuf; + unsigned int len; + bool unsafe; +}; + +struct cons_context_data; + /** * struct console - The console descriptor structure * @name: The name of the console driver @ include/linux/console.h:318 @ enum cons_flags { * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list + * + * @atomic_state: State array for NOBKL consoles; real and handover + * @atomic_seq: Sequence for record tracking (32bit only) + * @thread_pbufs: Pointer to thread private buffer + * @kthread: Pointer to kernel thread + * @rcuwait: RCU wait for the kernel thread + * @irq_work: IRQ work for thread wakeup + * @kthread_waiting: Indicator whether the kthread is waiting to be woken + * @write_atomic: Write callback for atomic context + * @write_thread: Write callback for printk threaded printing + * @port_lock: Callback to lock/unlock the port lock + * @pcpu_data: Pointer to percpu context data */ struct console { char name[16]; @ include/linux/console.h:349 @ struct console { unsigned long dropped; void *data; struct hlist_node node; + + /* NOBKL console specific members */ + atomic_long_t __private atomic_state[2]; +#ifndef CONFIG_64BIT + atomic_t __private atomic_seq; +#endif + struct printk_buffers *thread_pbufs; + struct task_struct *kthread; + struct rcuwait rcuwait; + struct irq_work irq_work; + atomic_t kthread_waiting; + + bool (*write_atomic)(struct console *con, struct cons_write_context *wctxt); + bool (*write_thread)(struct console *con, struct cons_write_context *wctxt); + void (*port_lock)(struct console *con, bool do_lock, unsigned long *flags); + + struct cons_context_data __percpu *pcpu_data; }; #ifdef CONFIG_LOCKDEP @ include/linux/console.h:492 @ static inline 
bool console_is_registered(const struct console *con) lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) +#ifdef CONFIG_PRINTK +extern enum cons_prio cons_atomic_enter(enum cons_prio prio); +extern void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio); +extern bool console_can_proceed(struct cons_write_context *wctxt); +extern bool console_enter_unsafe(struct cons_write_context *wctxt); +extern bool console_exit_unsafe(struct cons_write_context *wctxt); +extern bool console_try_acquire(struct cons_write_context *wctxt); +extern bool console_release(struct cons_write_context *wctxt); +#else +static inline enum cons_prio cons_atomic_enter(enum cons_prio prio) { return CONS_PRIO_NONE; } +static inline void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio) { } +static inline bool console_can_proceed(struct cons_write_context *wctxt) { return false; } +static inline bool console_enter_unsafe(struct cons_write_context *wctxt) { return false; } +static inline bool console_exit_unsafe(struct cons_write_context *wctxt) { return false; } +static inline bool console_try_acquire(struct cons_write_context *wctxt) { return false; } +static inline bool console_release(struct cons_write_context *wctxt) { return false; } +#endif + extern int console_set_on_cmdline; extern struct console *early_console; @ include/linux/entry-common.h:60 @ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif +#ifdef CONFIG_PREEMPT_LAZY +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#else +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) +#endif + #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @ include/linux/interrupt.h:608 @ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +#ifdef CONFIG_PREEMPT_RT +DECLARE_PER_CPU(struct task_struct *, timersd); +DECLARE_PER_CPU(unsigned long, pending_timer_softirq); + +extern void raise_timer_softirq(void); +extern void raise_hrtimer_softirq(void); + +static inline unsigned int local_pending_timers(void) +{ + return __this_cpu_read(pending_timer_softirq); +} + +#else +static inline void raise_timer_softirq(void) +{ + raise_softirq(TIMER_SOFTIRQ); +} + +static inline void raise_hrtimer_softirq(void) +{ + raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + +static inline unsigned int local_pending_timers(void) +{ + return local_softirq_pending(); +} +#endif + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) @ include/linux/io-mapping.h:72 @ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + migrate_disable(); pagefault_disable(); return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } @ include/linux/io-mapping.h:85 @ io_mapping_unmap_atomic(void __iomem *vaddr) { kunmap_local_indexed((void __force *)vaddr); pagefault_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + migrate_enable(); } static inline void __iomem * @ include/linux/io-mapping.h:171 @ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping 
*mapping, unsigned long offset) { - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + migrate_disable(); pagefault_disable(); return io_mapping_map_wc(mapping, offset, PAGE_SIZE); } @ include/linux/io-mapping.h:184 @ io_mapping_unmap_atomic(void __iomem *vaddr) { io_mapping_unmap(vaddr); pagefault_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + migrate_enable(); } static inline void __iomem * @ include/linux/netdevice.h:3205 @ struct softnet_data { int defer_count; int defer_ipi_scheduled; struct sk_buff *defer_list; +#ifndef CONFIG_PREEMPT_RT call_single_data_t defer_csd; +#else + struct work_struct defer_work; +#endif }; static inline void input_queue_head_incr(struct softnet_data *sd) @ include/linux/preempt.h:199 @ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @ include/linux/preempt.h:221 @ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ @ include/linux/preempt.h:241 @ do { \ #define preempt_enable() \ do { \ barrier(); \ - if (unlikely(preempt_count_dec_and_test())) \ + if (unlikely(preempt_count_dec_and_test())) { \ + instrumentation_begin(); \ __preempt_schedule(); \ + instrumentation_end(); \ + } \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ - if (unlikely(__preempt_count_dec_and_test())) \ + if (unlikely(__preempt_count_dec_and_test())) { \ + instrumentation_begin(); \ __preempt_schedule_notrace(); \ + instrumentation_end(); \ + } \ } while (0) #define preempt_check_resched() \ @ include/linux/preempt.h:264 @ do { \ __preempt_schedule(); \ } while (0) +/* + * open code preempt_check_resched() because it is not exported to modules and + * used by local_unlock() or bpf_enable_instrumentation(). 
+ */ +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + if (should_resched(0)) \ + __preempt_schedule(); \ +} while (0) + #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ @ include/linux/preempt.h:283 @ do { \ preempt_count_dec(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define preempt_enable_notrace() \ do { \ barrier(); \ @ include/linux/preempt.h:329 @ do { \ #define preempt_enable_notrace() barrier() #define preemptible() 0 +#define preempt_lazy_disable() barrier() +#define preempt_lazy_enable() barrier() + #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE @ include/linux/preempt.h:350 @ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) @ include/linux/preempt.h:466 @ extern void migrate_enable(void); #else -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } +static inline void migrate_disable(void) +{ + preempt_lazy_disable(); +} + +static inline void migrate_enable(void) +{ + preempt_lazy_enable(); +} #endif /* CONFIG_SMP */ @ include/linux/printk.h:142 @ void early_printk(const char *s, ...) { } #endif struct dev_printk_info; +struct cons_write_context; #ifdef CONFIG_PRINTK asmlinkage __printf(4, 0) @ include/linux/printk.h:161 @ int _printk(const char *fmt, ...); */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); -extern void __printk_safe_enter(void); -extern void __printk_safe_exit(void); +extern void __printk_safe_enter(unsigned long *flags); +extern void __printk_safe_exit(unsigned long *flags); +extern void __printk_deferred_enter(void); +extern void __printk_deferred_exit(void); /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. 
*/ -#define printk_deferred_enter __printk_safe_enter -#define printk_deferred_exit __printk_safe_exit +#define printk_deferred_enter() __printk_deferred_enter() +#define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @ include/linux/printk.h:198 @ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); +extern void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt, + bool skip_unsafe); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @ include/linux/printk.h:279 @ static inline void dump_stack(void) static inline void printk_trigger_flush(void) { } + +static inline void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt, + bool skip_unsafe) +{ +} + #endif #ifdef CONFIG_SMP @ include/linux/sched.h:306 @ extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); + +extern void sched_submit_work(void); +extern void sched_resume_work(void); +extern void schedule_rtmutex(void); + #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif @ include/linux/sched.h:2067 @ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. 
The return @ include/linux/sched/signal.h:25 @ struct sighand_struct { refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; + struct sigqueue *sigqueue_cache; }; /* @ include/linux/sched/signal.h:139 @ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - int posix_timer_id; + unsigned int next_posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ @ include/linux/sched/signal.h:353 @ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); +extern void sigqueue_free_cached_entry(struct sigqueue *q); extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); @ include/linux/serial_8250.h:128 @ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + bool console_newline_needed; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @ include/linux/serial_8250.h:144 @ struct uart_8250_port { /* Serial port overrun backoff */ struct delayed_work overrun_backoff; u32 overrun_backoff_time_ms; + + struct cons_write_context wctxt; + int cookie; }; static inline struct uart_8250_port *up_to_u8250p(struct uart_port *up) @ include/linux/serial_8250.h:186 @ void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); -void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count); +bool serial8250_console_write_atomic(struct uart_8250_port *up, + struct cons_write_context *wctxt); +bool serial8250_console_write_thread(struct uart_8250_port *up, + struct cons_write_context *wctxt); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); @ include/linux/thread_info.h:181 @ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti #endif /* !CONFIG_GENERIC_ENTRY */ #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H +# ifdef CONFIG_PREEMPT_LAZY + +static __always_inline bool tif_need_resched(void) +{ + return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY); +} + +static __always_inline bool tif_need_resched_now(void) +{ + return arch_test_bit(TIF_NEED_RESCHED, + (unsigned long *)(&current_thread_info()->flags)); +} + +static __always_inline bool tif_need_resched_lazy(void) +{ + return arch_test_bit(TIF_NEED_RESCHED_LAZY, + (unsigned long *)(&current_thread_info()->flags)); +} + +# else /* !CONFIG_PREEMPT_LAZY */ static __always_inline bool tif_need_resched(void) { @ include/linux/thread_info.h:208 @ static __always_inline bool tif_need_resched(void) (unsigned long *)(&current_thread_info()->flags)); } -#else +static __always_inline bool tif_need_resched_now(void) +{ + return tif_need_resched(); +} + +static __always_inline bool tif_need_resched_lazy(void) +{ + return false; +} + +# endif /* CONFIG_PREEMPT_LAZY */ +#else /* !_ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ +# ifdef CONFIG_PREEMPT_LAZY + +static __always_inline bool tif_need_resched(void) +{ + return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY); +} + +static __always_inline bool tif_need_resched_now(void) +{ + return test_bit(TIF_NEED_RESCHED, + (unsigned
long *)(&current_thread_info()->flags)); +} + +static __always_inline bool tif_need_resched_lazy(void) +{ + return test_bit(TIF_NEED_RESCHED_LAZY, + (unsigned long *)(&current_thread_info()->flags)); +} + +# else /* !CONFIG_PREEMPT_LAZY */ static __always_inline bool tif_need_resched(void) { @ include/linux/thread_info.h:247 @ static __always_inline bool tif_need_resched(void) (unsigned long *)(&current_thread_info()->flags)); } +static __always_inline bool tif_need_resched_now(void) +{ + return tif_need_resched(); +} + +static __always_inline bool tif_need_resched_lazy(void) +{ + return false; +} + +# endif /* !CONFIG_PREEMPT_LAZY */ #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES @ include/linux/trace_events.h:73 @ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned char preempt_lazy_count; }; #define TRACE_EVENT_TYPE_MAX \ @ include/linux/trace_events.h:162 @ static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; + entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; entry->pid = current->pid; entry->type = type; - entry->flags = trace_ctx >> 16; + entry->flags = trace_ctx >> 24; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); @ include/linux/trace_events.h:176 @ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, +#ifdef CONFIG_PREEMPT_LAZY + TRACE_FLAG_PREEMPT_RESCHED = 0x00, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x20, +#else + TRACE_FLAG_NEED_RESCHED_LAZY = 0x00, TRACE_FLAG_PREEMPT_RESCHED = 0x20, +#endif TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; @ include/trace/events/timer.h:161 @ DEFINE_EVENT(timer_class, timer_cancel, { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ - { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }) + { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ + { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ + { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ + { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ + { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_init - called when the hrtimer is initialized @ kernel/Kconfig.preempt:4 @ # SPDX-License-Identifier: GPL-2.0-only +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + config PREEMPT_NONE_BUILD bool @ kernel/debug/kdb/kdb_io.c:579 @ static void kdb_msg_write(const char *msg, int msg_len) continue; if (c == dbg_io_ops->cons) continue; + if (!c->write) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling @ kernel/entry/common.c:158 @ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & _TIF_NEED_RESCHED_MASK) schedule(); if (ti_work & _TIF_UPROBE) @ kernel/entry/common.c:389 @ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (should_resched(0)) preempt_schedule_irq(); } } @ kernel/fork.c:1668 @ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; + sig->sigqueue_cache = NULL; refcount_set(&sig->count, 1);
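	/*
	 * sigqueue_cache (added to struct sighand_struct above) starts out
	 * empty for a new sighand; __cleanup_sighand() below detaches any
	 * cached entry under siglock and releases it with
	 * sigqueue_free_cached_entry().
	 */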
spin_lock_irq(&current->sighand->siglock); @ kernel/fork.c:1685 @ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { if (refcount_dec_and_test(&sighand->count)) { + struct sigqueue *sigqueue = NULL; + signalfd_cleanup(sighand); + spin_lock_irq(&sighand->siglock); + if (sighand->sigqueue_cache) { + sigqueue = sighand->sigqueue_cache; + sighand->sigqueue_cache = NULL; + } + spin_unlock_irq(&sighand->siglock); + + sigqueue_free_cached_entry(sigqueue); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). @ kernel/ksysfs.c:170 @ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @ kernel/ksysfs.c:271 @ static struct attribute * kernel_attrs[] = { #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, #endif NULL }; @ kernel/locking/rtmutex.c:221 @ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, return try_cmpxchg_acquire(&lock->owner, &old, new); } +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @ kernel/locking/rtmutex.c:305 @ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, } +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always fail, + * which will unconditionally invoke sched_submit/resume_work() in + * the slow path of __rt_mutex_lock() and __ww_rt_mutex_lock() even + * in the non-contended case. + * + * Avoid that by using rt_mutex_slowtrylock() which is covered by + * the debug code and can acquire a non-contended rtmutex. On + * success the callsite avoids the sched_submit/resume_work() + * dance. + */ + return rt_mutex_slowtrylock(lock); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @ kernel/locking/rtmutex.c:1581 @ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) - schedule(); + schedule_rtmutex(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @ kernel/locking/rtmutex.c:1610 @ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, WARN(1, "rtmutex deadlock detected\n"); while (1) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + schedule_rtmutex(); } } @ kernel/locking/rtmutex.c:1705 @ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, unsigned long flags; int ret; + /* + * The task is about to sleep. Invoke sched_submit_work() before + * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+ */ + sched_submit_work(); + /* * Technically we could use raw_spin_[un]lock_irq() here, but this can * be called in early boot if the cmpxchg() fast path is disabled @ kernel/locking/rtmutex.c:1723 @ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + sched_resume_work(); return ret; } static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + lockdep_assert(!current->pi_blocked_on); + + if (likely(rt_mutex_try_acquire(lock))) return 0; return rt_mutex_slowlock(lock, NULL, state); @ kernel/locking/rwbase_rt.c:75 @ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, int ret; raw_spin_lock_irq(&rtm->wait_lock); - /* - * Allow readers, as long as the writer has not completely - * acquired the semaphore for write. - */ - if (atomic_read(&rwb->readers) != WRITER_BIAS) { - atomic_inc(&rwb->readers); - raw_spin_unlock_irq(&rtm->wait_lock); - return 0; - } /* * Call into the slow lock path with the rtmutex->wait_lock @ kernel/locking/rwbase_rt.c:134 @ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, unsigned int state) { + int ret; + + lockdep_assert(!current->pi_blocked_on); + if (rwbase_read_trylock(rwb)) return 0; - return __rwbase_read_lock(rwb, state); + /* + * The task is about to sleep. For rwsems this submits work as that + * might take locks and corrupt tsk::pi_blocked_on. Must be + * explicit here because __rwbase_read_lock() cannot invoke + * rt_mutex_slowlock(). NOP for rwlocks. + */ + rwbase_sched_submit_work(); + ret = __rwbase_read_lock(rwb, state); + rwbase_sched_resume_work(); + return ret; } static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, @ kernel/locking/rwbase_rt.c:246 @ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, struct rt_mutex_base *rtm = &rwb->rtmutex; unsigned long flags; - /* Take the rtmutex as a first step */ + /* + * Take the rtmutex as a first step. For rwsem this will also + * invoke sched_submit_work() to flush IO and workers. 
+ */ if (rwbase_rtmutex_lock_state(rtm, state)) return -EINTR; @ kernel/locking/rwsem.c:1418 @ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_rtmutex_lock_state(rtm, state) \ __rt_mutex_lock(rtm, state) +#define rwbase_sched_submit_work() \ + sched_submit_work() + +#define rwbase_sched_resume_work() \ + sched_resume_work() + #define rwbase_rtmutex_slowlock_locked(rtm, state) \ __rt_mutex_slowlock_locked(rtm, NULL, state) @ kernel/locking/spinlock_rt.c:40 @ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { + lockdep_assert(!current->pi_blocked_on); + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) rtlock_slowlock(rtm); } @ kernel/locking/spinlock_rt.c:164 @ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state) return 0; } +static __always_inline void rwbase_sched_submit_work(void) { } +static __always_inline void rwbase_sched_resume_work(void) { } + static __always_inline int rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) { @ kernel/locking/ww_rt_mutex.c:65 @ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, } mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); - if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { if (ww_ctx) ww_mutex_set_context_fastpath(lock, ww_ctx); return 0; @ kernel/panic.c:278 @ static void panic_other_cpus_shutdown(bool crash_kexec) */ void panic(const char *fmt, ...) { + enum cons_prio prev_prio; static char buf[1024]; va_list args; long i, i_next = 0, len; @ kernel/panic.c:326 @ void panic(const char *fmt, ...) if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); + prev_prio = cons_atomic_enter(CONS_PRIO_PANIC); + console_verbose(); bust_spinlocks(1); va_start(args, fmt); @ kernel/panic.c:388 @ void panic(const char *fmt, ...) if (_crash_kexec_post_notifiers) __crash_kexec(NULL); + cons_atomic_flush(NULL, true); + console_unblank(); /* @ kernel/panic.c:414 @ void panic(const char *fmt, ...) * We can't use the "normal" timers since we just panicked. */ pr_emerg("Rebooting in %d seconds..\n", panic_timeout); + cons_atomic_flush(NULL, true); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @ kernel/panic.c:433 @ void panic(const char *fmt, ...) */ if (panic_reboot_mode != REBOOT_UNDEFINED) reboot_mode = panic_reboot_mode; + cons_atomic_flush(NULL, true); emergency_restart(); } #ifdef __sparc__ @ kernel/panic.c:446 @ void panic(const char *fmt, ...) } #endif #if defined(CONFIG_S390) + cons_atomic_flush(NULL, true); disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); /* Do not scroll important messages printed above */ suppress_printk = 1; + + cons_atomic_exit(CONS_PRIO_PANIC, prev_prio); + local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @ kernel/panic.c:666 @ struct warn_args { void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { + enum cons_prio prev_prio; + + prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY); + disable_trace_on_warning(); if (file) @ kernel/panic.c:700 @ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); + + cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio); } #ifndef __WARN_FLAGS @ kernel/printk/Makefile:3 @ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o -obj-$(CONFIG_PRINTK) += printk_safe.o +obj-$(CONFIG_PRINTK) += printk_safe.o printk_nobkl.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o @ kernel/printk/internal.h:6 @ * internal.h - printk internal definitions */ #include <linux/percpu.h> +#include <linux/console.h> +#include "printk_ringbuffer.h" #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); @ kernel/printk/internal.h:17 @ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, #define printk_sysctl_init() do { } while (0) #endif -#ifdef CONFIG_PRINTK +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \ + (con->flags & CON_NO_BKL) ? "" : "legacy ", \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) +#ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER #define PRINTK_PREFIX_MAX 48 #else @ kernel/printk/internal.h:45 @ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +extern struct printk_ringbuffer *prb; +extern bool have_bkl_console; +extern bool printk_threads_enabled; + +extern bool have_boot_console; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @ kernel/printk/internal.h:61 @ __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); bool printk_percpu_data_ready(void); +/* + * The printk_safe_enter()/_exit() macros mark code blocks using locks that + * would lead to deadlock if an interrupting context were to call printk() + * while the interrupted context was within such code blocks. + * + * When a CPU is in such a code block, an interrupting context calling + * printk() will only log the new message to the lockless ringbuffer and + * then trigger console printing using irqwork. + */ + #define printk_safe_enter_irqsave(flags) \ do { \ - local_irq_save(flags); \ - __printk_safe_enter(); \ + __printk_safe_enter(&flags); \ } while (0) #define printk_safe_exit_irqrestore(flags) \ do { \ - __printk_safe_exit(); \ - local_irq_restore(flags); \ + __printk_safe_exit(&flags); \ } while (0) void defer_console_output(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); + +u64 cons_read_seq(struct console *con); +void cons_nobkl_cleanup(struct console *con); +bool cons_nobkl_init(struct console *con); +bool cons_alloc_percpu_data(struct console *con); +void cons_kthread_create(struct console *con); +void cons_wake_threads(void); +void cons_force_seq(struct console *con, u64 seq); +void console_bkl_kthread_create(void); + +/* + * Check if the given console is currently capable and allowed to print + * records. If the caller only works with certain types of consoles, the + * caller is responsible for checking the console type before calling + * this function. + */ +static inline bool console_is_usable(struct console *con, short flags) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + /* + * The usability of a console varies depending on whether + * it is a NOBKL console or not. + */ + + if (flags & CON_NO_BKL) { + if (have_boot_console) + return false; + + } else { + if (!con->write) + return false; + /* + * Console drivers may assume that per-cpu resources have + * been allocated. 
So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + } + + return true; +} + +/** + * cons_kthread_wake - Wake up a printk thread + * @con: Console to operate on + */ +static inline void cons_kthread_wake(struct console *con) +{ + rcuwait_wake_up(&con->rcuwait); +} + #else #define PRINTK_PREFIX_MAX 0 #define PRINTK_MESSAGE_MAX 0 #define PRINTKRB_RECORD_MAX 0 +static inline void cons_kthread_wake(struct console *con) { } +static inline void cons_kthread_create(struct console *con) { } +#define printk_threads_enabled (false) + /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so @ kernel/printk/internal.h:162 @ u16 printk_parse_prefix(const char *text, int *level, #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) static inline bool printk_percpu_data_ready(void) { return false; } +static inline bool cons_nobkl_init(struct console *con) { return true; } +static inline void cons_nobkl_cleanup(struct console *con) { } +static inline bool console_is_usable(struct console *con, short flags) { return false; } +static inline void cons_force_seq(struct console *con, u64 seq) { } + #endif /* CONFIG_PRINTK */ +extern bool have_boot_console; + /** * struct printk_buffers - Buffers to read/format/output printk messages. * @outbuf: After formatting, contains text to output. @ kernel/printk/internal.h:196 @ struct printk_message { u64 seq; unsigned long dropped; }; + +/** + * struct cons_context_data - console context data + * @wctxt: Write context per priority level + * @pbufs: Buffer for storing the text + * + * Used for early boot and for per CPU data. + * + * The write contexts are allocated to avoid having them on stack, e.g. in + * warn() or panic(). + */ +struct cons_context_data { + struct cons_write_context wctxt[CONS_PRIO_MAX]; + struct printk_buffers pbufs; +}; + +bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_supress); + +#ifdef CONFIG_PRINTK + +void console_prepend_dropped(struct printk_message *pmsg, + unsigned long dropped); + +#endif @ kernel/printk/printk.c:445 @ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * Specifies if a BKL console was ever registered. Used to determine if the + * console lock/unlock dance is needed for console printing. + */ +bool have_bkl_console; + +/* + * Specifies if a boot console is registered. Used to determine if NOBKL + * consoles may be used since NOBKL consoles cannot synchronize with boot + * consoles. + */ +bool have_boot_console; + +static int unregister_console_locked(struct console *console); + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ @ kernel/printk/printk.c:510 @ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, static struct printk_ringbuffer printk_rb_dynamic; -static struct printk_ringbuffer *prb = &printk_rb_static; +struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. 
per-CPU flush irq_work) before @ kernel/printk/printk.c:714 @ static ssize_t msg_print_ext_body(char *buf, size_t size, return len; } -static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, - bool is_extended, bool may_supress); - /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; @ kernel/printk/printk.c:1115 @ static inline void log_buf_add_cpu(void) {} static void __init set_percpu_data_ready(void) { + struct hlist_node *tmp; + struct console *con; + + console_list_lock(); + + hlist_for_each_entry_safe(con, tmp, &console_list, node) { + if (!cons_alloc_percpu_data(con)) + unregister_console_locked(con); + } + __printk_percpu_data_ready = true; + + console_list_unlock(); } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, @ kernel/printk/printk.c:2303 @ asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { + struct cons_write_context wctxt = { }; int printed_len; bool in_sched = false; @ kernel/printk/printk.c:2324 @ asmlinkage int vprintk_emit(int facility, int level, printed_len = vprintk_store(facility, level, dev_info, fmt, args); + /* + * The caller may be holding system-critical or + * timing-sensitive locks. Disable preemption during + * printing of all remaining records to all consoles so that + * this context can return as soon as possible. Hopefully + * another printk() caller will take over the printing. + */ + preempt_disable(); + + /* + * Flush the non-BKL consoles. This only leads to direct atomic + * printing for non-BKL consoles that do not have a printer + * thread available. Otherwise the printer thread will perform + * the printing. + */ + cons_atomic_flush(&wctxt, true); + /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { - /* - * The caller may be holding system-critical or - * timing-sensitive locks. Disable preemption during - * printing of all remaining records to all consoles so that - * this context can return as soon as possible. Hopefully - * another printk() caller will take over the printing. - */ - preempt_disable(); + if (!in_sched && have_bkl_console && !IS_ENABLED(CONFIG_PREEMPT_RT)) { /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the @ kernel/printk/printk.c:2351 @ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock_spinning()) console_unlock(); - preempt_enable(); } - wake_up_klogd(); + preempt_enable(); + + cons_wake_threads(); + if (in_sched) + defer_console_output(); + else + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @ kernel/printk/printk.c:2598 @ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig */ void suspend_console(void) { + struct console *con; + if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags | CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see that they are suspended so that it + * is guaranteed that all printing has stopped when this function + * completes. 
+ */ + synchronize_srcu(&console_srcu); + console_lock(); console_suspended = 1; up_console_sem(); @ kernel/printk/printk.c:2625 @ void suspend_console(void) void resume_console(void) { + struct console *con; + short flags; + int cookie; + if (!console_suspend_enabled) return; down_console_sem(); console_suspended = 0; console_unlock(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + flags = console_srcu_read_flags(con); + if (flags & CON_NO_BKL) + cons_kthread_wake(con); + } + console_srcu_read_unlock(cookie); + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && have_bkl_console) + wake_up_interruptible(&log_wait); + pr_flush(1000, true); } @ kernel/printk/printk.c:2672 @ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { - if (!cpuhp_tasks_frozen) { + if (!cpuhp_tasks_frozen && have_bkl_console) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @ kernel/printk/printk.c:2747 @ static bool abandon_console_lock_in_panic(void) return atomic_read(&panic_cpu) != raw_smp_processor_id(); } -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_srcu_read_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - short flags = console_srcu_read_flags(con); - - if (!(flags & CON_ENABLED)) - return false; - - if (!con->write) - return false; - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - static void __console_unlock(void) { console_locked = 0; @ kernel/printk/printk.c:2768 @ static void __console_unlock(void) * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ #ifdef CONFIG_PRINTK -static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); @ kernel/printk/printk.c:2800 @ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d pmsg->outbuf_len += len; } #else -#define console_prepend_dropped(pmsg, dropped) +static inline void console_prepend_dropped(struct printk_message *pmsg, + unsigned long dropped) { } #endif /* CONFIG_PRINTK */ /* @ kernel/printk/printk.c:2823 @ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) 
*/ -static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, - bool is_extended, bool may_suppress) +bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_suppress) { static int panic_console_dropped; @ kernel/printk/printk.c:2993 @ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove cookie = console_srcu_read_lock(); for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); bool progress; - if (!console_is_usable(con)) + /* console_flush_all() is only for legacy consoles. */ + if (flags & CON_NO_BKL) + continue; + + if (!console_is_usable(con, flags)) continue; any_usable = true; @ kernel/printk/printk.c:3038 @ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove return false; } -/** - * console_unlock - unblock the console subsystem from printing - * - * Releases the console_lock which the caller holds to block printing of - * the console subsystem. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * console_unlock(); may be called from any context. - */ -void console_unlock(void) +static u64 console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; - if (console_suspended) { - up_console_sem(); - return; - } - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @ kernel/printk/printk.c:3081 @ void console_unlock(void) * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); + + return next_seq; +} + +/** + * console_unlock - unblock the console subsystem from printing + * + * Releases the console_lock which the caller holds to block printing of + * the console subsystem. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + if (console_suspended) { + up_console_sem(); + return; + } + + /* + * PREEMPT_RT relies on kthread and atomic consoles for printing. + * It never attempts to print from console_unlock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + __console_unlock(); + return; + } + + console_flush_and_unlock(); } EXPORT_SYMBOL(console_unlock); @ kernel/printk/printk.c:3138 @ void console_unblank(void) struct console *c; int cookie; + if (!have_bkl_console) + return; + /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. @ kernel/printk/printk.c:3149 @ void console_unblank(void) * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { + /* Semaphores are not NMI-safe. */ + if (in_nmi()) + return; + if (down_trylock_console_sem() != 0) return; } else @ kernel/printk/printk.c:3182 @ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + struct console *c; + short flags; + int cookie; + u64 seq; + + seq = prb_first_valid_seq(prb); + + /* + * Safely flush the atomic consoles before trying to flush any + * BKL/legacy consoles. 
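+ * With CONSOLE_REPLAY_ALL they are rewound to the oldest valid record
+ * first so that the atomic flush replays the whole ringbuffer.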
+ */ + if (mode == CONSOLE_REPLAY_ALL) { + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + flags = console_srcu_read_flags(c); + if (flags & CON_NO_BKL) + cons_force_seq(c, seq); + } + console_srcu_read_unlock(cookie); + } + cons_atomic_flush(NULL, true); + + if (!have_bkl_console) + return; + /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so * that messages are flushed out. As this can be called from any * context and we don't want to get preempted while flushing, * ensure may_schedule is cleared. + * + * Since semaphores are not NMI-safe, the console lock must be + * ignored if the panic is in NMI context. */ - console_trylock(); + if (!in_nmi()) + console_trylock(); console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) { - struct console *c; - int cookie; - u64 seq; - - seq = prb_first_valid_seq(prb); - cookie = console_srcu_read_lock(); for_each_console_srcu(c) { /* @ kernel/printk/printk.c:3233 @ void console_flush_on_panic(enum con_flush_mode mode) } console_srcu_read_unlock(cookie); } - console_unlock(); + if (!in_nmi()) + console_unlock(); } /* @ kernel/printk/printk.c:3291 @ EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { + short flags; + console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); + flags = console->flags; console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. The related + * printing context must be able to see it is enabled so that + * it is guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + + if (flags & CON_NO_BKL) + cons_kthread_wake(console); + else if (IS_ENABLED(CONFIG_PREEMPT_RT)) + wake_up_interruptible(&log_wait); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); +static struct task_struct *console_bkl_kthread; + +static bool printer_should_wake(u64 seq) +{ + bool available = false; + struct console *con; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + + if (flags & CON_NO_BKL) + continue; + if (!console_is_usable(con, flags)) + continue; + /* + * It is safe to read @seq because only this + * thread context updates @seq. + */ + if (prb_read_valid(prb, con->seq, NULL)) { + available = true; + break; + } + } + console_srcu_read_unlock(cookie); + + return available; +} + +static int console_bkl_kthread_func(void *unused) +{ + u64 seq = 0; + int error; + + for (;;) { + error = wait_event_interruptible(log_wait, printer_should_wake(seq)); + + if (kthread_should_stop()) + break; + + if (error) + continue; + + console_lock(); + if (console_suspended) + up_console_sem(); + else + seq = console_flush_and_unlock(); + } + return 0; +} + +void console_bkl_kthread_create(void) +{ + struct task_struct *kt; + struct console *c; + + lockdep_assert_held(&console_mutex); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + + if (!printk_threads_enabled || console_bkl_kthread) + return; + + for_each_console(c) { + if (c->flags & CON_BOOT) + return; + } + + kt = kthread_run(console_bkl_kthread_func, NULL, "pr/bkl"); + if (IS_ERR(kt)) { + pr_err("unable to start BKL printing thread\n"); + return; + } + + console_bkl_kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. 
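+ * Hence the thread is kept as SCHED_NORMAL but given nice level -20.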
+ */ + sched_set_normal(console_bkl_kthread, -20); +} + static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) @ kernel/printk/printk.c:3486 @ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } -#define con_printk(lvl, con, fmt, ...) \ - printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ - (con->flags & CON_BOOT) ? "boot" : "", \ - con->name, con->index, ##__VA_ARGS__) - static void console_init_seq(struct console *newcon, bool bootcon_registered) { struct console *con; @ kernel/printk/printk.c:3550 @ static void console_init_seq(struct console *newcon, bool bootcon_registered) #define console_first() \ hlist_entry(console_list.first, struct console, node) -static int unregister_console_locked(struct console *console); - /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @ kernel/printk/printk.c:3641 @ void register_console(struct console *newcon) newcon->dropped = 0; console_init_seq(newcon, bootcon_registered); + if (!(newcon->flags & CON_NO_BKL)) { + have_bkl_console = true; + console_bkl_kthread_create(); + } else if (!cons_nobkl_init(newcon)) { + goto unlock; + } + + if (newcon->flags & CON_BOOT) + have_boot_console = true; + /* * Put this console in the list - keep the * preferred driver at the head of the list. @ kernel/printk/printk.c:3694 @ void register_console(struct console *newcon) if (con->flags & CON_BOOT) unregister_console_locked(con); } + + /* All boot consoles have been unregistered. */ + have_boot_console = false; } unlock: console_list_unlock(); @ kernel/printk/printk.c:3706 @ EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { + struct console *c; + bool is_boot_con; int res; lockdep_assert_console_list_lock_held(); - con_printk(KERN_INFO, console, "disabled\n"); + is_boot_con = console->flags & CON_BOOT; res = _braille_unregister_console(console); if (res < 0) @ kernel/printk/printk.c:3720 @ static int unregister_console_locked(struct console *console) if (res > 0) return 0; - /* Disable it unconditionally */ - console_srcu_write_flags(console, console->flags & ~CON_ENABLED); - if (!console_is_registered_locked(console)) return -ENODEV; + console_srcu_write_flags(console, console->flags & ~CON_ENABLED); + + con_printk(KERN_INFO, console, "disabled\n"); + hlist_del_init_rcu(&console->node); /* @ kernel/printk/printk.c:3748 @ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); + if (console->flags & CON_NO_BKL) + cons_nobkl_cleanup(console); + console_sysfs_notify(); if (console->exit) res = console->exit(console); + /* + * Each time a boot console unregisters, try to start up the printing + * threads. They will only start if this was the last boot console. + */ + if (is_boot_con) { + for_each_console(c) + cons_kthread_create(c); + } + return res; } @ kernel/printk/printk.c:3926 @ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre /* * Hold the console_lock to guarantee safe access to - * console->seq and to prevent changes to @console_suspended - * until all consoles have been processed. + * console->seq. 
*/ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { + short flags; + if (con && con != c) continue; - if (!console_is_usable(c)) + + flags = console_srcu_read_flags(c); + + if (!console_is_usable(c, flags)) continue; + + /* + * Since the console is locked, use this opportunity + * to update console->seq for NOBKL consoles. + */ + if (flags & CON_NO_BKL) + c->seq = cons_read_seq(c); + printk_seq = c->seq; if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); - /* - * If consoles are suspended, it cannot be expected that they - * make forward progress, so timeout immediately. @diff is - * still used to return a valid flush status. - */ - if (console_suspended) - remaining = 0; - else if (diff != last_diff && reset_on_progress) + if (diff != last_diff && reset_on_progress) remaining = timeout_ms; console_unlock(); @ kernel/printk/printk.c:4013 @ static void wake_up_klogd_work_func(struct irq_work *irq_work) int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* The BKL thread waits on @log_wait. */ + pending |= PRINTK_PENDING_WAKEUP; + } else { + /* + * If trylock fails, some other context + * will do the printing. + */ + if (console_trylock()) + console_unlock(); + } } if (pending & PRINTK_PENDING_WAKEUP) @ kernel/printk/printk.c:4058 @ static void __wake_up_klogd(int val) preempt_enable(); } +/** + * wake_up_klogd - Wake kernel logging daemon + * + * Use this function when new records have been added to the ringbuffer + * and the console printing for those records is handled elsewhere. In + * this case only the logging daemon needs to be woken. + * + * Context: Any context. + */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } +/** + * defer_console_output - Wake kernel logging daemon and trigger + * console printing in a deferred context + * + * Use this function when new records have been added to the ringbuffer + * but the current context is unable to perform the console printing. + * This function also wakes the logging daemon. + * + * Context: Any context. + */ void defer_console_output(void) { + int val = PRINTK_PENDING_WAKEUP; + /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ - __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); + if (have_bkl_console) + val |= PRINTK_PENDING_OUTPUT; + __wake_up_klogd(val); } void printk_trigger_flush(void) { + struct cons_write_context wctxt = { }; + + preempt_disable(); + cons_atomic_flush(&wctxt, true); + preempt_enable(); + + cons_wake_threads(); defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; + return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) 
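Before the new lockless console implementation that follows, a short driver-side illustration may help to show how the pieces introduced above fit together. The sketch below is one possible write_atomic() callback built on console_enter_unsafe()/console_exit_unsafe() as documented in printk_nobkl.c; struct my_uart, the register offsets and the MMIO accessors are hypothetical stand-ins for real hardware and are not part of this series.

#include <linux/console.h>
#include <linux/io.h>
#include <linux/processor.h>

/* Hypothetical UART; only sketches where the MMIO accesses would go. */
struct my_uart {
	void __iomem *base;
};

static struct my_uart my_uart_port;

#define MY_UART_TXDATA		0x00	/* hypothetical register layout */
#define MY_UART_STAT		0x04
#define MY_UART_STAT_TXEMPTY	0x01

static bool my_uart_write_atomic(struct console *con,
				 struct cons_write_context *wctxt)
{
	struct my_uart *uart = &my_uart_port;
	unsigned int i;

	/*
	 * The FIFO accesses below must not be interleaved with another
	 * context owning the console. Mark the region unsafe; a false
	 * return means ownership was lost and neither the buffer nor the
	 * hardware may be touched anymore.
	 */
	if (!console_enter_unsafe(wctxt))
		return false;

	for (i = 0; i < wctxt->len; i++) {
		while (!(readl(uart->base + MY_UART_STAT) & MY_UART_STAT_TXEMPTY))
			cpu_relax();
		writel(wctxt->outbuf[i], uart->base + MY_UART_TXDATA);
	}

	/* Leaving the unsafe region is also a handover point for waiters. */
	if (!console_exit_unsafe(wctxt))
		return false;

	/* Report success back to cons_emit_record(). */
	return true;
}

A write_thread() callback for the per-console printer thread can follow the same pattern; for longer transfers, console_can_proceed() may additionally be checked at safe points so that a pending higher-priority request gets a friendly handover instead of having to resort to a hostile takeover.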
@ kernel/printk/printk_nobkl.c:4 @ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2022 Linutronix GmbH, John Ogness +// Copyright (C) 2022 Intel, Thomas Gleixner + +#include <linux/kernel.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/syscore_ops.h> +#include "printk_ringbuffer.h" +#include "internal.h" +/* + * Printk implementation for consoles that do not depend on the BKL style + * console_lock mechanism. + * + * Console is locked on a CPU when state::locked is set and state:cpu == + * current CPU. This is valid for the current execution context. + * + * Nesting execution contexts on the same CPU can carefully take over + * if the driver allows reentrancy via state::unsafe = false. When the + * interrupted context resumes it checks the state before entering + * an unsafe region and aborts the operation if it detects a takeover. + * + * In case of panic or emergency the nesting context can take over the + * console forcefully. The write callback is then invoked with the unsafe + * flag set in the write context data, which allows the driver side to avoid + * locks and to evaluate the driver state so it can use an emergency path + * or repair the state instead of blindly assuming that it works. + * + * If the interrupted context touches the assigned record buffer after + * takeover, it does not cause harm because at the same execution level + * there is no concurrency on the same CPU. A threaded printer always has + * its own record buffer so it can never interfere with any of the per CPU + * record buffers. + * + * A concurrent writer on a different CPU can request to take over the + * console by: + * + * 1) Carefully writing the desired state into state[REQ] + * if there is no same or higher priority request pending. + * This locks state[REQ] except for higher priority + * waiters. + * + * 2) Setting state[CUR].req_prio unless a same or higher + * priority waiter won the race. + * + * 3) Carefully spin on state[CUR] until that is locked with the + * expected state. When the state is not the expected one then it + * has to verify that state[REQ] is still the same and that + * state[CUR] has not been taken over or unlocked. + * + * The unlocker hands over to state[REQ], but only if state[CUR] + * matches. + * + * In case that the owner does not react on the request and does not make + * observable progress, the waiter will timeout and can then decide to do + * a hostile takeover. + */ + +#define copy_full_state(_dst, _src) do { _dst = _src; } while (0) +#define copy_bit_state(_dst, _src) do { _dst.bits = _src.bits; } while (0) + +#ifdef CONFIG_64BIT +#define copy_seq_state64(_dst, _src) do { _dst.seq = _src.seq; } while (0) +#else +#define copy_seq_state64(_dst, _src) do { } while (0) +#endif + +enum state_selector { + CON_STATE_CUR, + CON_STATE_REQ, +}; + +/** + * cons_state_set - Helper function to set the console state + * @con: Console to update + * @which: Selects real state or handover state + * @new: The new state to write + * + * Only to be used when the console is not yet or no longer visible in the + * system. Otherwise use cons_state_try_cmpxchg(). 
+ */ +static inline void cons_state_set(struct console *con, enum state_selector which, + struct cons_state *new) +{ + atomic_long_set(&ACCESS_PRIVATE(con, atomic_state[which]), new->atom); +} + +/** + * cons_state_read - Helper function to read the console state + * @con: Console to update + * @which: Selects real state or handover state + * @state: The state to store the result + */ +static inline void cons_state_read(struct console *con, enum state_selector which, + struct cons_state *state) +{ + state->atom = atomic_long_read(&ACCESS_PRIVATE(con, atomic_state[which])); +} + +/** + * cons_state_try_cmpxchg() - Helper function for atomic_long_try_cmpxchg() on console state + * @con: Console to update + * @which: Selects real state or handover state + * @old: Old/expected state + * @new: New state + * + * Returns: True on success, false on fail + */ +static inline bool cons_state_try_cmpxchg(struct console *con, + enum state_selector which, + struct cons_state *old, + struct cons_state *new) +{ + return atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_state[which]), + &old->atom, new->atom); +} + +/** + * cons_state_full_match - Check whether the full state matches + * @cur: The state to check + * @prev: The previous state + * + * Returns: True if matching, false otherwise. + * + * Check the full state including state::seq on 64bit. For take over + * detection. + */ +static inline bool cons_state_full_match(struct cons_state cur, + struct cons_state prev) +{ + /* + * req_prio can be set by a concurrent writer for friendly + * handover. Ignore it in the comparison. + */ + cur.req_prio = prev.req_prio; + return cur.atom == prev.atom; +} + +/** + * cons_state_bits_match - Check for matching state bits + * @cur: The state to check + * @prev: The previous state + * + * Returns: True if state matches, false otherwise. + * + * Contrary to cons_state_full_match this checks only the bits and ignores + * a sequence change on 64bits. On 32bit the two functions are identical. + */ +static inline bool cons_state_bits_match(struct cons_state cur, struct cons_state prev) +{ + /* + * req_prio can be set by a concurrent writer for friendly + * handover. Ignore it in the comparison. + */ + cur.req_prio = prev.req_prio; + return cur.bits == prev.bits; +} + +/** + * cons_check_panic - Check whether a remote CPU is in panic + * + * Returns: True if a remote CPU is in panic, false otherwise. + */ +static inline bool cons_check_panic(void) +{ + unsigned int pcpu = atomic_read(&panic_cpu); + + return pcpu != PANIC_CPU_INVALID && pcpu != smp_processor_id(); +} + +static struct cons_context_data early_cons_ctxt_data __initdata; + +/** + * cons_context_set_pbufs - Set the output text buffer for the current context + * @ctxt: Pointer to the acquire context + * + * Buffer selection: + * 1) Early boot uses the global (initdata) buffer + * 2) Printer threads use the dynamically allocated per-console buffers + * 3) All other contexts use the per CPU buffers + * + * This guarantees that there is no concurrency on the output records ever. + * Early boot and per CPU nesting is not a problem. The takeover logic + * tells the interrupted context that the buffer has been overwritten. + * + * There are two critical regions that matter: + * + * 1) Context is filling the buffer with a record. After interruption + * it continues to sprintf() the record and before it goes to + * write it out, it checks the state, notices the takeover, discards + * the content and backs out. 
+ * + * 2) Context is in a unsafe critical region in the driver. After + * interruption it might read overwritten data from the output + * buffer. When it leaves the critical region it notices and backs + * out. Hostile takeovers in driver critical regions are best effort + * and there is not much that can be done about that. + */ +static __ref void cons_context_set_pbufs(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + + /* Thread context or early boot? */ + if (ctxt->thread) + ctxt->pbufs = con->thread_pbufs; + else if (!con->pcpu_data) + ctxt->pbufs = &early_cons_ctxt_data.pbufs; + else + ctxt->pbufs = &(this_cpu_ptr(con->pcpu_data)->pbufs); +} + +/** + * cons_seq_init - Helper function to initialize the console sequence + * @con: Console to work on + * + * Set @con->atomic_seq to the starting record, or if that record no + * longer exists, the oldest available record. For init only. Do not + * use for runtime updates. + */ +static void cons_seq_init(struct console *con) +{ + u32 seq = (u32)max_t(u64, con->seq, prb_first_valid_seq(prb)); +#ifdef CONFIG_64BIT + struct cons_state state; + + cons_state_read(con, CON_STATE_CUR, &state); + state.seq = seq; + cons_state_set(con, CON_STATE_CUR, &state); +#else + atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq); +#endif +} + +/** + * cons_force_seq - Force a specified sequence number for a console + * @con: Console to work on + * @seq: Sequence number to force + * + * This function is only intended to be used in emergency situations. In + * particular: console_flush_on_panic(CONSOLE_REPLAY_ALL) + */ +void cons_force_seq(struct console *con, u64 seq) +{ +#ifdef CONFIG_64BIT + struct cons_state old; + struct cons_state new; + + do { + cons_state_read(con, CON_STATE_CUR, &old); + copy_bit_state(new, old); + new.seq = seq; + } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)); +#else + atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq); +#endif +} + +static inline u64 cons_expand_seq(u64 seq) +{ + u64 rbseq; + + /* + * The provided sequence is only the lower 32bits of the ringbuffer + * sequence. It needs to be expanded to 64bit. Get the next sequence + * number from the ringbuffer and fold it. + */ + rbseq = prb_next_seq(prb); + seq = rbseq - ((u32)rbseq - (u32)seq); + + return seq; +} + +/** + * cons_read_seq - Read the current console sequence + * @con: Console to read the sequence of + * + * Returns: Sequence number of the next record to print on @con. + */ +u64 cons_read_seq(struct console *con) +{ + u64 seq; +#ifdef CONFIG_64BIT + struct cons_state state; + + cons_state_read(con, CON_STATE_CUR, &state); + seq = state.seq; +#else + seq = atomic_read(&ACCESS_PRIVATE(con, atomic_seq)); +#endif + return cons_expand_seq(seq); +} + +/** + * cons_context_set_seq - Setup the context with the next sequence to print + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * + * On return the retrieved sequence number is stored in ctxt->oldseq. + * + * The sequence number is safe in forceful takeover situations. + * + * Either the writer succeeded to update before it got interrupted + * or it failed. In the latter case the takeover will print the + * same line again. + * + * The sequence is only the lower 32bits of the ringbuffer sequence. The + * ringbuffer must be 2^31 records ahead to get out of sync. This needs + * some care when starting a console, i.e setting the sequence to 0 is + * wrong. 
It has to be set to the oldest valid sequence in the ringbuffer + * as that cannot be more than 2^31 records away + * + * On 64bit the 32bit sequence is part of console::state, which is saved + * in @ctxt->state. This prevents the 32bit update race. + */ +static void cons_context_set_seq(struct cons_context *ctxt) +{ +#ifdef CONFIG_64BIT + ctxt->oldseq = ctxt->state.seq; +#else + ctxt->oldseq = atomic_read(&ACCESS_PRIVATE(ctxt->console, atomic_seq)); +#endif + ctxt->oldseq = cons_expand_seq(ctxt->oldseq); + ctxt->newseq = ctxt->oldseq; +} + +/** + * cons_seq_try_update - Try to update the console sequence number + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * + * Returns: True if the console sequence was updated, false otherwise. + * + * Internal helper as the logic is different on 32bit and 64bit. + * + * On 32 bit the sequence is separate from state and therefore + * subject to a subtle race in the case of hostile takeovers. + * + * On 64 bit the sequence is part of the state and therefore safe + * vs. hostile takeovers. + * + * In case of fail the console has been taken over and @ctxt is + * invalid. Caller has to reacquire the console. + */ +#ifdef CONFIG_64BIT +static bool cons_seq_try_update(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + struct cons_state old; + struct cons_state new; + + cons_state_read(con, CON_STATE_CUR, &old); + do { + /* Make sure this context is still the owner. */ + if (!cons_state_bits_match(old, ctxt->state)) + return false; + + /* Preserve bit state */ + copy_bit_state(new, old); + new.seq = ctxt->newseq; + + /* + * Can race with hostile takeover or with a handover + * request. + */ + } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)); + + copy_full_state(ctxt->state, new); + ctxt->oldseq = ctxt->newseq; + + return true; +} +#else +static bool cons_release(struct cons_context *ctxt); +static bool cons_seq_try_update(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + struct cons_state state; + int pcpu; + u32 old; + u32 new; + + /* + * There is a corner case that needs to be considered here: + * + * CPU0 CPU1 + * printk() + * acquire() -> emergency + * write() acquire() + * update_seq() + * state == OK + * --> NMI + * takeover() + * <--- write() + * cmpxchg() succeeds update_seq() + * cmpxchg() fails + * + * There is nothing that can be done about this other than having + * yet another state bit that needs to be tracked and analyzed, + * but fails to cover the problem completely. + * + * No other scenarios expose such a problem. On same CPU takeovers + * the cmpxchg() always fails on the interrupted context after the + * interrupting context finished printing, but that's fine as it + * does not own the console anymore. The state check after the + * failed cmpxchg prevents that. + */ + cons_state_read(con, CON_STATE_CUR, &state); + /* Make sure this context is still the owner. */ + if (!cons_state_bits_match(state, ctxt->state)) + return false; + + /* + * Get the original sequence number that was retrieved + * from @con->atomic_seq. @con->atomic_seq should be still + * the same. 32bit truncates. See cons_context_set_seq(). + */ + old = (u32)ctxt->oldseq; + new = (u32)ctxt->newseq; + if (atomic_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_seq), &old, new)) { + ctxt->oldseq = ctxt->newseq; + return true; + } + + /* + * Reread the state. If this context does not own the console anymore + * then it cannot touch the sequence again. 
+ */ + cons_state_read(con, CON_STATE_CUR, &state); + if (!cons_state_bits_match(state, ctxt->state)) + return false; + + pcpu = atomic_read(&panic_cpu); + if (pcpu == smp_processor_id()) { + /* + * This is the panic CPU. Emitting a warning here does not + * help at all. The callchain is clear and the priority is + * to get the messages out. In the worst case duplicated + * ones. That's a job for postprocessing. + */ + atomic_set(&ACCESS_PRIVATE(con, atomic_seq), new); + ctxt->oldseq = ctxt->newseq; + return true; + } + + /* + * Only emit a warning when this happens outside of a panic + * situation as on panic it's neither useful nor helping to let the + * panic CPU get the important stuff out. + */ + WARN_ON_ONCE(pcpu == PANIC_CPU_INVALID); + + cons_release(ctxt); + return false; +} +#endif + +/** + * cons_cleanup_handover - Cleanup a handover request + * @ctxt: Pointer to acquire context + * + * @ctxt->hov_state contains the state to clean up + */ +static void cons_cleanup_handover(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + struct cons_state new; + + /* + * No loop required. Either hov_state is still the same or + * not. + */ + new.atom = 0; + cons_state_try_cmpxchg(con, CON_STATE_REQ, &ctxt->hov_state, &new); +} + +/** + * cons_setup_handover - Setup a handover request + * @ctxt: Pointer to acquire context + * + * Returns: True if a handover request was setup, false otherwise. + * + * On success @ctxt->hov_state contains the requested handover state + * + * On failure this context is not allowed to request a handover from the + * current owner. Reasons would be priority too low or a remote CPU in panic. + * In both cases this context should give up trying to acquire the console. + */ +static bool cons_setup_handover(struct cons_context *ctxt) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct cons_state old; + struct cons_state hstate = { + .locked = 1, + .cur_prio = ctxt->prio, + .cpu = cpu, + }; + + /* + * Try to store hstate in @con->atomic_state[REQ]. This might + * race with a higher priority waiter. + */ + cons_state_read(con, CON_STATE_REQ, &old); + do { + if (cons_check_panic()) + return false; + + /* Same or higher priority waiter exists? */ + if (old.cur_prio >= ctxt->prio) + return false; + + } while (!cons_state_try_cmpxchg(con, CON_STATE_REQ, &old, &hstate)); + + /* Save that state for comparison in spinwait */ + copy_full_state(ctxt->hov_state, hstate); + return true; +} + +/** + * cons_setup_request - Setup a handover request in state[CUR] + * @ctxt: Pointer to acquire context + * @old: The state that was used to make the decision to spin wait + * + * Returns: True if a handover request was setup in state[CUR], false + * otherwise. + * + * On success @ctxt->req_state contains the request state that was set in + * state[CUR] + * + * On failure this context encountered unexpected state values. This + * context should retry the full handover request setup process (the + * handover request setup by cons_setup_handover() is now invalidated + * and must be performed again). + */ +static bool cons_setup_request(struct cons_context *ctxt, struct cons_state old) +{ + struct console *con = ctxt->console; + struct cons_state cur; + struct cons_state new; + + /* Now set the request in state[CUR] */ + cons_state_read(con, CON_STATE_CUR, &cur); + do { + if (cons_check_panic()) + goto cleanup; + + /* Bit state changed vs. the decision to spinwait? 
*/ + if (!cons_state_bits_match(cur, old)) + goto cleanup; + + /* + * A higher or equal priority context already setup a + * request? + */ + if (cur.req_prio >= ctxt->prio) + goto cleanup; + + /* Setup a request for handover. */ + copy_full_state(new, cur); + new.req_prio = ctxt->prio; + } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new)); + + /* Save that state for comparison in spinwait */ + copy_bit_state(ctxt->req_state, new); + return true; + +cleanup: + cons_cleanup_handover(ctxt); + return false; +} + +/** + * cons_try_acquire_spin - Complete the spinwait attempt + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * + * @ctxt->hov_state contains the handover state that was set in + * state[REQ] + * @ctxt->req_state contains the request state that was set in + * state[CUR] + * + * Returns: 0 if successfully locked. -EBUSY on timeout. -EAGAIN on + * unexpected state values. + * + * On success @ctxt->state contains the new state that was set in + * state[CUR] + * + * On -EBUSY failure this context timed out. This context should either + * give up or attempt a hostile takeover. + * + * On -EAGAIN failure this context encountered unexpected state values. + * This context should retry the full handover request setup process (the + * handover request setup by cons_setup_handover() is now invalidated and + * must be performed again). + */ +static int cons_try_acquire_spin(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + struct cons_state cur; + struct cons_state new; + int err = -EAGAIN; + int timeout; + + /* Now wait for the other side to hand over */ + for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { + /* Timeout immediately if a remote panic is detected. */ + if (cons_check_panic()) + break; + + cons_state_read(con, CON_STATE_CUR, &cur); + + /* + * If the real state of the console matches the handover state + * that this context setup, then the handover was a success + * and this context is now the owner. + * + * Note that this might have raced with a new higher priority + * requester coming in after the lock was handed over. + * However, that requester will see that the owner changes and + * setup a new request for the current owner (this context). + */ + if (cons_state_bits_match(cur, ctxt->hov_state)) + goto success; + + /* + * If state changed since the request was made, give up as + * it is no longer consistent. This must include + * state::req_prio since there could be a higher priority + * request available. + */ + if (cur.bits != ctxt->req_state.bits) + goto cleanup; + + /* + * Finally check whether the handover state is still + * the same. + */ + cons_state_read(con, CON_STATE_REQ, &cur); + if (cur.atom != ctxt->hov_state.atom) + goto cleanup; + + /* Account time */ + if (timeout > 0) + udelay(1); + } + + /* + * Timeout. Cleanup the handover state and carefully try to reset + * req_prio in the real state. The reset is important to ensure + * that the owner does not hand over the lock after this context + * has given up waiting. + */ + cons_cleanup_handover(ctxt); + + cons_state_read(con, CON_STATE_CUR, &cur); + do { + /* + * The timeout might have raced with the owner coming late + * and handing it over gracefully. + */ + if (cons_state_bits_match(cur, ctxt->hov_state)) + goto success; + + /* + * Validate that the state matches with the state at request + * time. 
If this check fails, there is already a higher + * priority context waiting or the owner has changed (either + * by higher priority or by hostile takeover). In all fail + * cases this context is no longer in line for a handover to + * take place, so no reset is necessary. + */ + if (cur.bits != ctxt->req_state.bits) + goto cleanup; + + copy_full_state(new, cur); + new.req_prio = 0; + } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new)); + /* Reset worked. Report timeout. */ + return -EBUSY; + +success: + /* Store the real state */ + copy_full_state(ctxt->state, cur); + ctxt->hostile = false; + err = 0; + +cleanup: + cons_cleanup_handover(ctxt); + return err; +} + +/** + * __cons_try_acquire - Try to acquire the console for printk output + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * + * Returns: True if the acquire was successful. False on fail. + * + * In case of success @ctxt->state contains the acquisition + * state. + * + * In case of fail @ctxt->old_state contains the state + * that was read from @con->state for analysis by the caller. + */ +static bool __cons_try_acquire(struct cons_context *ctxt) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + short flags = console_srcu_read_flags(con); + struct cons_state old; + struct cons_state new; + int err; + + if (WARN_ON_ONCE(!(flags & CON_NO_BKL))) + return false; +again: + cons_state_read(con, CON_STATE_CUR, &old); + + /* Preserve it for the caller and for spinwait */ + copy_full_state(ctxt->old_state, old); + + if (cons_check_panic()) + return false; + + /* Set up the new state for takeover */ + copy_full_state(new, old); + new.locked = 1; + new.thread = ctxt->thread; + new.cur_prio = ctxt->prio; + new.req_prio = CONS_PRIO_NONE; + new.cpu = cpu; + + /* Attempt to acquire it directly if unlocked */ + if (!old.locked) { + if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)) + goto again; + + ctxt->hostile = false; + copy_full_state(ctxt->state, new); + goto success; + } + + /* + * A threaded printer context will never spin or perform a + * hostile takeover. The atomic writer will wake the thread + * when it is done with the important output. + */ + if (ctxt->thread) + return false; + + /* + * If the active context is on the same CPU then there is + * obviously no handshake possible. + */ + if (old.cpu == cpu) + goto check_hostile; + + /* + * If a handover request with same or higher priority is already + * pending then this context cannot setup a handover request. + */ + if (old.req_prio >= ctxt->prio) + goto check_hostile; + + /* + * If the caller did not request spin-waiting then performing a + * handover is not an option. + */ + if (!ctxt->spinwait) + goto check_hostile; + + /* + * Setup the request in state[REQ]. If this fails then this + * context is not allowed to setup a handover request. + */ + if (!cons_setup_handover(ctxt)) + goto check_hostile; + + /* + * Setup the request in state[CUR]. Hand in the state that was + * used to make the decision to spinwait above, for comparison. If + * this fails then unexpected state values were encountered and the + * full request setup process is retried. + */ + if (!cons_setup_request(ctxt, old)) + goto again; + + /* + * Spin-wait to acquire the console. If this fails then unexpected + * state values were encountered (for example, a hostile takeover by + * another context) and the full request setup process is retried. 
+ */ + err = cons_try_acquire_spin(ctxt); + if (err) { + if (err == -EAGAIN) + goto again; + goto check_hostile; + } +success: + /* Common updates on success */ + cons_context_set_seq(ctxt); + cons_context_set_pbufs(ctxt); + return true; + +check_hostile: + if (!ctxt->hostile) + return false; + + if (cons_check_panic()) + return false; + + if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)) + goto again; + + copy_full_state(ctxt->state, new); + goto success; +} + +/** + * cons_try_acquire - Try to acquire the console for printk output + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * + * Returns: True if the acquire was successful. False on fail. + * + * In case of success @ctxt->state contains the acquisition + * state. + * + * In case of fail @ctxt->old_state contains the state + * that was read from @con->state for analysis by the caller. + */ +static bool cons_try_acquire(struct cons_context *ctxt) +{ + if (__cons_try_acquire(ctxt)) + return true; + + ctxt->state.atom = 0; + return false; +} + +/** + * __cons_release - Release the console after output is done + * @ctxt: The acquire context that contains the state + * at cons_try_acquire() + * + * Returns: True if the release was regular + * + * False if the console is in unusable state or was handed over + * with handshake or taken over hostile without handshake. + * + * The return value tells the caller whether it needs to evaluate further + * printing. + */ +static bool __cons_release(struct cons_context *ctxt) +{ + struct console *con = ctxt->console; + short flags = console_srcu_read_flags(con); + struct cons_state hstate; + struct cons_state old; + struct cons_state new; + + if (WARN_ON_ONCE(!(flags & CON_NO_BKL))) + return false; + + cons_state_read(con, CON_STATE_CUR, &old); +again: + if (!cons_state_bits_match(old, ctxt->state)) + return false; + + /* Release it directly when no handover request is pending. */ + if (!old.req_prio) + goto unlock; + + /* Read the handover target state */ + cons_state_read(con, CON_STATE_REQ, &hstate); + + /* If the waiter gave up hstate is 0 */ + if (!hstate.atom) + goto unlock; + + /* + * If a higher priority waiter raced against a lower priority + * waiter then unlock instead of handing over to either. The + * higher priority waiter will notice the updated state and + * retry. + */ + if (hstate.cur_prio != old.req_prio) + goto unlock; + + /* Switch the state and preserve the sequence on 64bit */ + copy_bit_state(new, hstate); + copy_seq_state64(new, old); + if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)) + goto again; + + return true; + +unlock: + /* Clear the state and preserve the sequence on 64bit */ + new.atom = 0; + copy_seq_state64(new, old); + if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new)) + goto again; + + return true; +} + +bool printk_threads_enabled __ro_after_init; +static bool printk_force_atomic __initdata; + +/** + * cons_release - Release the console after output is done + * @ctxt: The acquire context that contains the state + * at cons_try_acquire() + * + * Returns: True if the release was regular + * + * False if the console is in unusable state or was handed over + * with handshake or taken over hostile without handshake. + * + * The return value tells the caller whether it needs to evaluate further + * printing. + */ +static bool cons_release(struct cons_context *ctxt) +{ + bool ret = __cons_release(ctxt); + + /* Invalidate the buffer pointer. It is no longer valid. 
*/ + ctxt->pbufs = NULL; + + ctxt->state.atom = 0; + return ret; +} + +bool console_try_acquire(struct cons_write_context *wctxt) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + return cons_try_acquire(ctxt); +} +EXPORT_SYMBOL_GPL(console_try_acquire); + +bool console_release(struct cons_write_context *wctxt) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + return cons_release(ctxt); +} +EXPORT_SYMBOL_GPL(console_release); + +/** + * cons_alloc_percpu_data - Allocate percpu data for a console + * @con: Console to allocate for + * + * Returns: True on success. False otherwise and the console cannot be used. + * + * If it is not yet possible to allocate per CPU data, success is returned. + * When per CPU data becomes possible, set_percpu_data_ready() will call + * this function again for all registered consoles. + */ +bool cons_alloc_percpu_data(struct console *con) +{ + if (!printk_percpu_data_ready()) + return true; + + con->pcpu_data = alloc_percpu(typeof(*con->pcpu_data)); + if (con->pcpu_data) + return true; + + con_printk(KERN_WARNING, con, "failed to allocate percpu buffers\n"); + return false; +} + +/** + * cons_free_percpu_data - Free percpu data of a console on unregister + * @con: Console to clean up + */ +static void cons_free_percpu_data(struct console *con) +{ + if (!con->pcpu_data) + return; + + free_percpu(con->pcpu_data); + con->pcpu_data = NULL; +} + +/** + * console_can_proceed - Check whether printing can proceed + * @wctxt: The write context that was handed to the write function + * + * Returns: True if the state is correct. False if a handover + * has been requested or if the console was taken + * over. + * + * Must be invoked after the record was dumped into the assigned record + * buffer and at appropriate safe places in the driver. For unsafe driver + * sections see console_enter_unsafe(). + * + * When this function returns false then the calling context is not allowed + * to go forward and has to back out immediately and carefully. The buffer + * content is no longer trusted either and the console lock is no longer + * held. + */ +bool console_can_proceed(struct cons_write_context *wctxt) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct cons_state state; + + cons_state_read(con, CON_STATE_CUR, &state); + /* Store it for analysis or reuse */ + copy_full_state(ctxt->old_state, state); + + /* Make sure this context is still the owner. */ + if (!cons_state_full_match(state, ctxt->state)) + return false; + + /* + * Having a safe point for take over and eventually a few + * duplicated characters or a full line is way better than a + * hostile takeover. Post processing can take care of the garbage. + * Continue if the requested priority is not sufficient. + */ + if (state.req_prio <= state.cur_prio) + return true; + + /* + * A console printer within an unsafe region is allowed to continue. + * It can perform the handover when exiting the safe region. Otherwise + * a hostile takeover will be necessary. + */ + if (state.unsafe) + return true; + + /* Release and hand over */ + cons_release(ctxt); + /* + * This does not check whether the handover succeeded. The + * outermost callsite has to make the final decision whether printing + * should continue or not (via reacquire, possibly hostile). The + * console is unlocked already so go back all the way instead of + * trying to implement heuristics in tons of places. 
+ */ + return false; +} +EXPORT_SYMBOL_GPL(console_can_proceed); + +/** + * __console_update_unsafe - Update the unsafe bit in @con->atomic_state + * @wctxt: The write context that was handed to the write function + * + * Returns: True if the state is correct. False if a handover + * has been requested or if the console was taken + * over. + * + * Must be invoked before an unsafe driver section is entered. + * + * When this function returns false then the calling context is not allowed + * to go forward and has to back out immediately and carefully. The buffer + * content is no longer trusted either and the console lock is no longer + * held. + * + * Internal helper to avoid duplicated code + */ +static bool __console_update_unsafe(struct cons_write_context *wctxt, bool unsafe) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct cons_state new; + + do { + if (!console_can_proceed(wctxt)) + return false; + /* + * console_can_proceed() saved the real state in + * ctxt->old_state + */ + copy_full_state(new, ctxt->old_state); + new.unsafe = unsafe; + + } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &ctxt->old_state, &new)); + + copy_full_state(ctxt->state, new); + return true; +} + +/** + * console_enter_unsafe - Enter an unsafe region in the driver + * @wctxt: The write context that was handed to the write function + * + * Returns: True if the state is correct. False if a handover + * has been requested or if the console was taken + * over. + * + * Must be invoked before an unsafe driver section is entered. + * + * When this function returns false then the calling context is not allowed + * to go forward and has to back out immediately and carefully. The buffer + * content is no longer trusted either and the console lock is no longer + * held. + */ +bool console_enter_unsafe(struct cons_write_context *wctxt) +{ + return __console_update_unsafe(wctxt, true); +} +EXPORT_SYMBOL_GPL(console_enter_unsafe); + +/** + * console_exit_unsafe - Exit an unsafe region in the driver + * @wctxt: The write context that was handed to the write function + * + * Returns: True if the state is correct. False if a handover + * has been requested or if the console was taken + * over. + * + * Must be invoked before an unsafe driver section is exited. + * + * When this function returns false then the calling context is not allowed + * to go forward and has to back out immediately and carefully. The buffer + * content is no longer trusted either and the console lock is no longer + * held. + */ +bool console_exit_unsafe(struct cons_write_context *wctxt) +{ + return __console_update_unsafe(wctxt, false); +} +EXPORT_SYMBOL_GPL(console_exit_unsafe); + +/** + * cons_get_record - Fill the buffer with the next pending ringbuffer record + * @wctxt: The write context which will be handed to the write function + * + * Returns: True if there are records available. If the next record should + * be printed, the output buffer is filled and @wctxt->outbuf + * points to the text to print. If @wctxt->outbuf is NULL after + * the call, the record should not be printed but the caller must + * still update the console sequence number. + * + * False means that there are no pending records anymore and the + * printing can stop. 
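+ *
+ * The @wctxt->outbuf == NULL case typically means that the record was
+ * suppressed (e.g. filtered by the console loglevel), but the sequence
+ * number still has to advance past it.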
+ */ +static bool cons_get_record(struct cons_write_context *wctxt) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; + struct printk_message pmsg = { + .pbufs = ctxt->pbufs, + }; + + if (!printk_get_next_message(&pmsg, ctxt->newseq, is_extended, true)) + return false; + + ctxt->newseq = pmsg.seq; + ctxt->dropped += pmsg.dropped; + + if (pmsg.outbuf_len == 0) { + wctxt->outbuf = NULL; + } else { + if (ctxt->dropped && !is_extended) + console_prepend_dropped(&pmsg, ctxt->dropped); + wctxt->outbuf = &pmsg.pbufs->outbuf[0]; + } + + wctxt->len = pmsg.outbuf_len; + + return true; +} + +/** + * cons_emit_record - Emit record in the acquired context + * @wctxt: The write context that will be handed to the write function + * + * Returns: False if the operation was aborted (takeover or handover). + * True otherwise. + * + * When false is returned, the caller is not allowed to touch console state. + * The console is owned by someone else. If the caller wants to print more + * it has to reacquire the console first. + * + * When true is returned, @wctxt->ctxt.backlog indicates whether there are + * still records pending in the ringbuffer. + */ +static bool cons_emit_record(struct cons_write_context *wctxt) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + bool done = false; + + /* + * @con->dropped is not protected in case of hostile takeovers so + * the update below is racy. Annotate it accordingly. + */ + ctxt->dropped = data_race(READ_ONCE(con->dropped)); + + /* Fill the output buffer with the next record */ + ctxt->backlog = cons_get_record(wctxt); + if (!ctxt->backlog) + return true; + + /* Safety point. Don't touch state in case of takeover */ + if (!console_can_proceed(wctxt)) + return false; + + /* Counterpart to the read above */ + WRITE_ONCE(con->dropped, ctxt->dropped); + + /* + * In case of skipped records, update the sequence state in @con. + */ + if (!wctxt->outbuf) + goto update; + + /* Tell the driver about potential unsafe state */ + wctxt->unsafe = ctxt->state.unsafe; + + if (!ctxt->thread && con->write_atomic) { + done = con->write_atomic(con, wctxt); + } else if (ctxt->thread && con->write_thread) { + done = con->write_thread(con, wctxt); + } else { + cons_release(ctxt); + WARN_ON_ONCE(1); + return false; + } + + /* If not done, the write was aborted due to takeover */ + if (!done) + return false; + + /* If there was a dropped message, it has now been output. */ + if (ctxt->dropped) { + ctxt->dropped = 0; + /* Counterpart to the read above */ + WRITE_ONCE(con->dropped, ctxt->dropped); + } +update: + ctxt->newseq++; + /* + * The sequence update attempt is not part of console_release() + * because in panic situations the console is not released by + * the panic CPU until all records are written. On 32bit the + * sequence is separate from state anyway. + */ + return cons_seq_try_update(ctxt); +} + +/** + * cons_kthread_should_wakeup - Check whether the printk thread should wake up + * @con: Console to operate on + * @ctxt: The acquire context that contains the state + * at console_acquire() + * + * Returns: True if the thread should shut down or if the console is allowed to + * print and a record is available. False otherwise. + * + * After the thread wakes up, it must first check if it should shut down before + * attempting any printing.
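+ * The wakeup may have been caused by kthread_stop() during console
+ * unregistration, in which case the console state must not be touched
+ * anymore.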
+ */ +static bool cons_kthread_should_wakeup(struct console *con, struct cons_context *ctxt) +{ + bool is_usable; + short flags; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + flags = console_srcu_read_flags(con); + is_usable = console_is_usable(con, flags); + console_srcu_read_unlock(cookie); + + if (!is_usable) + return false; + + /* This reads state and sequence on 64bit. On 32bit only state */ + cons_state_read(con, CON_STATE_CUR, &ctxt->state); + + /* + * Atomic printing is running on some other CPU. The owner + * will wake the console thread on unlock if necessary. + */ + if (ctxt->state.locked) + return false; + + /* Bring the sequence in @ctxt up to date */ + cons_context_set_seq(ctxt); + + return prb_read_valid(prb, ctxt->oldseq, NULL); +} + +/** + * cons_kthread_func - The printk thread function + * @__console: Console to operate on + */ +static int cons_kthread_func(void *__console) +{ + struct console *con = __console; + struct cons_write_context wctxt = { + .ctxt.console = con, + .ctxt.prio = CONS_PRIO_NORMAL, + .ctxt.thread = 1, + }; + struct cons_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + unsigned long flags; + short con_flags; + bool backlog; + int cookie; + int ret; + + for (;;) { + atomic_inc(&con->kthread_waiting); + + /* + * Provides a full memory barrier vs. cons_kthread_wake(). + */ + ret = rcuwait_wait_event(&con->rcuwait, + cons_kthread_should_wakeup(con, ctxt), + TASK_INTERRUPTIBLE); + + atomic_dec(&con->kthread_waiting); + + if (kthread_should_stop()) + break; + + /* Wait was interrupted by a spurious signal, go back to sleep */ + if (ret) + continue; + + for (;;) { + cookie = console_srcu_read_lock(); + + /* + * Ensure this stays on the CPU to make handover and + * takeover possible. + */ + if (con->port_lock) + con->port_lock(con, true, &flags); + else + migrate_disable(); + + /* + * Try to acquire the console without attempting to + * take over. If an atomic printer wants to hand + * back to the thread it simply wakes it up. + */ + if (!cons_try_acquire(ctxt)) + break; + + con_flags = console_srcu_read_flags(con); + + if (console_is_usable(con, con_flags)) { + /* + * If the emit fails, this context is no + * longer the owner. Abort the processing and + * wait for new records to print. + */ + if (!cons_emit_record(&wctxt)) + break; + backlog = ctxt->backlog; + } else { + backlog = false; + } + + /* + * If the release fails, this context was not the + * owner. Abort the processing and wait for new + * records to print. + */ + if (!cons_release(ctxt)) + break; + + /* Backlog done? */ + if (!backlog) + break; + + if (con->port_lock) + con->port_lock(con, false, &flags); + else + migrate_enable(); + + console_srcu_read_unlock(cookie); + + cond_resched(); + } + if (con->port_lock) + con->port_lock(con, false, &flags); + else + migrate_enable(); + + console_srcu_read_unlock(cookie); + } + return 0; +} + +/** + * cons_irq_work - irq work to wake printk thread + * @irq_work: The irq work to operate on + */ +static void cons_irq_work(struct irq_work *irq_work) +{ + struct console *con = container_of(irq_work, struct console, irq_work); + + cons_kthread_wake(con); +} + +/** + * cons_wake_threads - Wake up printing threads + * + * A printing thread is only woken if it is within the @kthread_waiting + * block. If it is not within the block (or enters the block later), it + * will see any new records and continue printing on its own. 
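+ *
+ * "Within the @kthread_waiting block" means between the atomic_inc()
+ * and atomic_dec() that bracket the rcuwait in cons_kthread_func().
+ * Either the queued irq_work wakes the thread out of the rcuwait, or
+ * the thread re-evaluates prb_read_valid() itself and sees the new
+ * record.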
+ */ +void cons_wake_threads(void) +{ + struct console *con; + int cookie; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + if (con->kthread && atomic_read(&con->kthread_waiting)) + irq_work_queue(&con->irq_work); + } + console_srcu_read_unlock(cookie); +} + +/** + * struct cons_cpu_state - Per CPU printk context state + * @prio: The current context priority level + * @nesting: Per priority nest counter + */ +struct cons_cpu_state { + enum cons_prio prio; + int nesting[CONS_PRIO_MAX]; +}; + +static DEFINE_PER_CPU(struct cons_cpu_state, cons_pcpu_state); +static struct cons_cpu_state early_cons_pcpu_state __initdata; + +/** + * cons_get_cpu_state - Get the per CPU console state pointer + * + * Returns either a pointer to the per CPU state of the current CPU or to + * the init data state during early boot. + */ +static __ref struct cons_cpu_state *cons_get_cpu_state(void) +{ + if (!printk_percpu_data_ready()) + return &early_cons_pcpu_state; + + return this_cpu_ptr(&cons_pcpu_state); +} + +/** + * cons_get_wctxt - Get the write context for atomic printing + * @con: Console to operate on + * @prio: Priority of the context + * + * Returns either the per CPU context or the builtin context for + * early boot. + */ +static __ref struct cons_write_context *cons_get_wctxt(struct console *con, + enum cons_prio prio) +{ + if (!con->pcpu_data) + return &early_cons_ctxt_data.wctxt[prio]; + + return &this_cpu_ptr(con->pcpu_data)->wctxt[prio]; +} + +/** + * cons_atomic_try_acquire - Try to acquire the console for atomic printing + * @con: The console to acquire + * @ctxt: The console context instance to work on + * @prio: The priority of the current context + */ +static bool cons_atomic_try_acquire(struct console *con, struct cons_context *ctxt, + enum cons_prio prio, bool skip_unsafe) +{ + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->spinwait_max_us = 2000; + ctxt->prio = prio; + ctxt->spinwait = 1; + + /* Try to acquire it directly or via a friendly handover */ + if (cons_try_acquire(ctxt)) + return true; + + /* Investigate whether a hostile takeover is due */ + if (ctxt->old_state.cur_prio >= prio) + return false; + + if (!ctxt->old_state.unsafe || !skip_unsafe) + ctxt->hostile = 1; + return cons_try_acquire(ctxt); +} + +/** + * cons_atomic_flush_con - Flush one console in atomic mode + * @wctxt: The write context struct to use for this context + * @con: The console to flush + * @prio: The priority of the current context + * @skip_unsafe: True, to avoid unsafe hostile takeovers + */ +static void cons_atomic_flush_con(struct cons_write_context *wctxt, struct console *con, + enum cons_prio prio, bool skip_unsafe) +{ + struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool wake_thread = false; + short flags; + + if (!cons_atomic_try_acquire(con, ctxt, prio, skip_unsafe)) + return; + + do { + flags = console_srcu_read_flags(con); + + if (!console_is_usable(con, flags)) + break; + + /* + * For normal prio messages let the printer thread handle + * the printing if it is available. + */ + if (prio <= CONS_PRIO_NORMAL && con->kthread) { + wake_thread = true; + break; + } + + /* + * cons_emit_record() returns false when the console was + * handed over or taken over. In both cases the context is + * no longer valid. 
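+ * The new owner is responsible for the console state from now on,
+ * which is why this path returns directly instead of falling through
+ * to the cons_release() below.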
+ */ + if (!cons_emit_record(wctxt)) + return; + } while (ctxt->backlog); + + cons_release(ctxt); + + if (wake_thread && atomic_read(&con->kthread_waiting)) + irq_work_queue(&con->irq_work); +} + +/** + * cons_atomic_flush - Flush consoles in atomic mode if required + * @printk_caller_wctxt: The write context struct to use for this + * context (for printk() context only) + * @skip_unsafe: True, to avoid unsafe hostile takeovers + */ +void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt, bool skip_unsafe) +{ + struct cons_write_context *wctxt; + struct cons_cpu_state *cpu_state; + struct console *con; + short flags; + int cookie; + + cpu_state = cons_get_cpu_state(); + + /* + * When in an elevated priority, the printk() calls are not + * individually flushed. This is to allow the full output to + * be dumped to the ringbuffer before starting with printing + * the backlog. + */ + if (cpu_state->prio > CONS_PRIO_NORMAL && printk_caller_wctxt) + return; + + /* + * Let the outermost write of this priority print. This avoids + * nasty hackery for nested WARN() where the printing itself + * generates one. + * + * cpu_state->prio <= CONS_PRIO_NORMAL is not subject to nesting + * and can proceed in order to allow atomic printing when consoles + * do not have a printer thread. + */ + if (cpu_state->prio > CONS_PRIO_NORMAL && + cpu_state->nesting[cpu_state->prio] != 1) + return; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + if (!con->write_atomic) + continue; + + flags = console_srcu_read_flags(con); + + if (!console_is_usable(con, flags)) + continue; + + if (cpu_state->prio > CONS_PRIO_NORMAL || !con->kthread) { + if (printk_caller_wctxt) + wctxt = printk_caller_wctxt; + else + wctxt = cons_get_wctxt(con, cpu_state->prio); + cons_atomic_flush_con(wctxt, con, cpu_state->prio, skip_unsafe); + } + } + console_srcu_read_unlock(cookie); +} + +/** + * cons_atomic_enter - Enter a context that enforces atomic printing + * @prio: Priority of the context + * + * Returns: The previous priority that needs to be fed into + * the corresponding cons_atomic_exit() + */ +enum cons_prio cons_atomic_enter(enum cons_prio prio) +{ + struct cons_cpu_state *cpu_state; + enum cons_prio prev_prio; + + migrate_disable(); + cpu_state = cons_get_cpu_state(); + + prev_prio = cpu_state->prio; + if (prev_prio < prio) + cpu_state->prio = prio; + + /* + * Increment the nesting on @cpu_state->prio so a WARN() + * nested into a panic printout does not attempt to + * scribble state. + */ + cpu_state->nesting[cpu_state->prio]++; + + return prev_prio; +} + +/** + * cons_atomic_exit - Exit a context that enforces atomic printing + * @prio: Priority of the context to leave + * @prev_prio: Priority of the previous context for restore + * + * @prev_prio is the priority returned by the corresponding cons_atomic_enter(). + */ +void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio) +{ + struct cons_cpu_state *cpu_state; + + cons_atomic_flush(NULL, true); + + cpu_state = cons_get_cpu_state(); + + if (cpu_state->prio == CONS_PRIO_PANIC) + cons_atomic_flush(NULL, false); + + /* + * Undo the nesting of cons_atomic_enter() at the CPU state + * priority. + */ + cpu_state->nesting[cpu_state->prio]--; + + /* + * Restore the previous priority, which was returned by + * cons_atomic_enter(). 
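+ *
+ * The expected pairing, as used for the RCU stall warnings, is:
+ *
+ *	prev = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+ *	...  emit the emergency output via printk()  ...
+ *	cons_atomic_exit(CONS_PRIO_EMERGENCY, prev);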
+ */ + cpu_state->prio = prev_prio; + + migrate_enable(); +} + +/** + * cons_kthread_stop - Stop a printk thread + * @con: Console to operate on + */ +static void cons_kthread_stop(struct console *con) +{ + lockdep_assert_console_list_lock_held(); + + if (!con->kthread) + return; + + kthread_stop(con->kthread); + con->kthread = NULL; + + kfree(con->thread_pbufs); + con->thread_pbufs = NULL; +} + +/** + * cons_kthread_create - Create a printk thread + * @con: Console to operate on + * + * If it fails, let the console proceed. The atomic part might + * be usable and useful. + */ +void cons_kthread_create(struct console *con) +{ + struct task_struct *kt; + struct console *c; + + lockdep_assert_console_list_lock_held(); + + if (!(con->flags & CON_NO_BKL) || !con->write_thread) + return; + + if (!printk_threads_enabled || con->kthread) + return; + + /* + * Printer threads cannot be started as long as any boot console is + * registered because there is no way to synchronize the hardware + * registers between boot console code and regular console code. + */ + for_each_console(c) { + if (c->flags & CON_BOOT) + return; + } + have_boot_console = false; + + con->thread_pbufs = kmalloc(sizeof(*con->thread_pbufs), GFP_KERNEL); + if (!con->thread_pbufs) { + con_printk(KERN_ERR, con, "failed to allocate printing thread buffers\n"); + return; + } + + kt = kthread_run(cons_kthread_func, con, "pr/%s%d", con->name, con->index); + if (IS_ERR(kt)) { + con_printk(KERN_ERR, con, "failed to start printing thread\n"); + kfree(con->thread_pbufs); + con->thread_pbufs = NULL; + return; + } + + con->kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(con->kthread, -20); +} + +static int __init printk_setup_threads(void) +{ + struct console *con; + + if (printk_force_atomic) + return 0; + + console_list_lock(); + printk_threads_enabled = true; + for_each_console(con) + cons_kthread_create(con); + if (have_bkl_console) + console_bkl_kthread_create(); + console_list_unlock(); + return 0; +} +early_initcall(printk_setup_threads); + +/** + * cons_nobkl_init - Initialize the NOBKL console specific data + * @con: Console to initialize + * + * Returns: True on success. False otherwise and the console cannot be used. + */ +bool cons_nobkl_init(struct console *con) +{ + struct cons_state state = { }; + + if (!cons_alloc_percpu_data(con)) + return false; + + rcuwait_init(&con->rcuwait); + atomic_set(&con->kthread_waiting, 0); + init_irq_work(&con->irq_work, cons_irq_work); + cons_state_set(con, CON_STATE_CUR, &state); + cons_state_set(con, CON_STATE_REQ, &state); + cons_seq_init(con); + cons_kthread_create(con); + return true; +} + +/** + * cons_nobkl_cleanup - Cleanup the NOBKL console specific data + * @con: Console to cleanup + */ +void cons_nobkl_cleanup(struct console *con) +{ + struct cons_state state = { }; + + cons_kthread_stop(con); + cons_state_set(con, CON_STATE_CUR, &state); + cons_state_set(con, CON_STATE_REQ, &state); + cons_free_percpu_data(con); +} + +/** + * printk_kthread_shutdown - shutdown all threaded printers + * + * On system shutdown all threaded printers are stopped. This allows printk + * to transition back to atomic printing, thus providing a robust mechanism + * for the final shutdown/reboot messages to be output. 
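+ * With the threads stopped, further output falls back to the atomic
+ * write callbacks invoked directly from the printk() caller context.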
+ */ +static void printk_kthread_shutdown(void) +{ + struct console *con; + + console_list_lock(); + for_each_console(con) { + if (con->flags & CON_NO_BKL) + cons_kthread_stop(con); + } + console_list_unlock(); +} + +static struct syscore_ops printk_syscore_ops = { + .shutdown = printk_kthread_shutdown, +}; + +static int __init printk_init_ops(void) +{ + register_syscore_ops(&printk_syscore_ops); + return 0; +} +device_initcall(printk_init_ops); @ kernel/printk/printk_safe.c:15 @ #include "internal.h" -static DEFINE_PER_CPU(int, printk_context); +struct printk_context { + local_lock_t cpu; + int recursion; +}; + +static DEFINE_PER_CPU(struct printk_context, printk_context) = { + .cpu = INIT_LOCAL_LOCK(cpu), +}; /* Can be preempted by NMI. */ -void __printk_safe_enter(void) +void __printk_safe_enter(unsigned long *flags) { - this_cpu_inc(printk_context); + WARN_ON_ONCE(in_nmi()); + local_lock_irqsave(&printk_context.cpu, *flags); + this_cpu_inc(printk_context.recursion); } /* Can be preempted by NMI. */ -void __printk_safe_exit(void) +void __printk_safe_exit(unsigned long *flags) { - this_cpu_dec(printk_context); + WARN_ON_ONCE(in_nmi()); + this_cpu_dec(printk_context.recursion); + local_unlock_irqrestore(&printk_context.cpu, *flags); +} + +void __printk_deferred_enter(void) +{ + WARN_ON_ONCE(!in_atomic()); + this_cpu_inc(printk_context.recursion); +} + +void __printk_deferred_exit(void) +{ + WARN_ON_ONCE(!in_atomic()); + this_cpu_dec(printk_context.recursion); } asmlinkage int vprintk(const char *fmt, va_list args) @ kernel/printk/printk_safe.c:64 @ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) || in_nmi()) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - defer_console_output(); - return len; - } + if (this_cpu_read(printk_context.recursion) || in_nmi()) + return vprintk_deferred(fmt, args); /* No obstacles. */ return vprintk_default(fmt, args); @ kernel/rcu/rcutorture.c:2410 @ static int rcutorture_booster_init(unsigned int cpu) WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_PREEMPT_RT + t = per_cpu(timersd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#endif } /* Don't allow time recalculation while creating a new task. */ @ kernel/rcu/tree_stall.h:11 @ */ #include <linux/kvm_para.h> +#include <linux/console.h> ////////////////////////////////////////////////////////////////////////////// // @ kernel/rcu/tree_stall.h:586 @ static void rcu_check_gp_kthread_expired_fqs_timer(void) static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { + enum cons_prio prev_prio; int cpu; unsigned long flags; unsigned long gpa; @ kernel/rcu/tree_stall.h:602 @ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rcu_stall_is_suppressed()) return; + prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY); + /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.rst for info on how to debug @ kernel/rcu/tree_stall.h:658 @ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) panic_on_rcu_stall(); rcu_force_quiescent_state(); /* Kick them all. 
*/ + + cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio); } static void print_cpu_stall(unsigned long gps) @ kernel/sched/core.c:1045 @ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY + +static int tsk_is_polling(struct task_struct *p) +{ +#ifdef TIF_POLLING_NRFLAG + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +#else + return 0; +#endif +} + +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @ kernel/sched/core.c:2273 @ void migrate_disable(void) preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; + preempt_lazy_disable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @ kernel/sched/core.c:2309 @ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; + preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @ kernel/sched/core.c:3363 @ int migrate_swap(struct task_struct *cur, struct task_struct *p, } #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_PREEMPT_RT + +/* + * Consider: + * + * set_special_state(X); + * + * do_things() + * // Somewhere in there is an rtlock that can be contended: + * current_save_and_set_rtlock_wait_state(); + * [...] + * schedule_rtlock(); (A) + * [...] + * current_restore_rtlock_saved_state(); + * + * schedule(); (B) + * + * If p->saved_state is anything else than TASK_RUNNING, then p blocked on an + * rtlock (A) *before* voluntarily calling into schedule() (B) after setting its + * state to X. For things like ptrace (X=TASK_TRACED), the task could have more + * work to do upon acquiring the lock in do_things() before whoever called + * wait_task_inactive() should return. IOW, we have to wait for: + * + * p.saved_state = TASK_RUNNING + * p.__state = X + * + * which implies the task isn't blocked on an RT lock and got to schedule() (B). + * + * Also see comments in ttwu_state_match(). + */ + +static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state) +{ + unsigned long flags; + bool mismatch; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + if (READ_ONCE(p->__state) & match_state) + mismatch = false; + else if (READ_ONCE(p->saved_state) & match_state) + mismatch = false; + else + mismatch = true; + + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return mismatch; +} +static __always_inline bool state_match(struct task_struct *p, unsigned int match_state, + bool *wait) +{ + if (READ_ONCE(p->__state) & match_state) + return true; + if (READ_ONCE(p->saved_state) & match_state) { + *wait = true; + return true; + } + return false; +} +#else +static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state) +{ + return !(READ_ONCE(p->__state) & match_state); +} +static __always_inline bool state_match(struct task_struct *p, unsigned int match_state, + bool *wait) +{ + return (READ_ONCE(p->__state) & match_state); +} +#endif + /* * wait_task_inactive - wait for a thread to unschedule. 
* @ kernel/sched/core.c:3451 @ int migrate_swap(struct task_struct *cur, struct task_struct *p, */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { - int running, queued; + bool running, wait; struct rq_flags rf; unsigned long ncsw; struct rq *rq; @ kernel/sched/core.c:3477 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * is actually now running somewhere else! */ while (task_on_cpu(rq, p)) { - if (!(READ_ONCE(p->__state) & match_state)) + if (state_mismatch(p, match_state)) return 0; cpu_relax(); } @ kernel/sched/core.c:3490 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_on_cpu(rq, p); - queued = task_on_rq_queued(p); + wait = task_on_rq_queued(p); ncsw = 0; - if (READ_ONCE(p->__state) & match_state) + + if (state_match(p, match_state, &wait)) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @ kernel/sched/core.c:3523 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(queued)) { + if (unlikely(wait)) { ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); @ kernel/sched/core.c:4828 @ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @ kernel/sched/core.c:6707 @ static void __sched notrace __schedule(unsigned int sched_mode) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; @ kernel/sched/core.c:6768 @ void __noreturn do_task_dead(void) cpu_relax(); } -static inline void sched_submit_work(struct task_struct *tsk) +void sched_submit_work(void) { - unsigned int task_flags; + struct task_struct *tsk = current; + unsigned int task_flags = tsk->flags; - if (task_is_running(tsk)) - return; - - task_flags = tsk->flags; /* * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. 
@ kernel/sched/core.c:6798 @ static inline void sched_submit_work(struct task_struct *tsk) blk_flush_plug(tsk->plug, true); } -static void sched_update_worker(struct task_struct *tsk) +void sched_resume_work(void) { + struct task_struct *tsk = current; + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); @ kernel/sched/core.c:6810 @ static void sched_update_worker(struct task_struct *tsk) } } -asmlinkage __visible void __sched schedule(void) +static void schedule_loop(unsigned int sched_mode) { - struct task_struct *tsk = current; - - sched_submit_work(tsk); do { preempt_disable(); - __schedule(SM_NONE); + __schedule(sched_mode); sched_preempt_enable_no_resched(); } while (need_resched()); - sched_update_worker(tsk); +} + +asmlinkage __visible void __sched schedule(void) +{ + if (!task_is_running(current)) + sched_submit_work(); + schedule_loop(SM_NONE); + sched_resume_work(); } EXPORT_SYMBOL(schedule); +void schedule_rtmutex(void) +{ + schedule_loop(SM_NONE); +} + /* * synchronize_rcu_tasks() makes sure that no task is stuck in preempted * state (have scheduled out non-voluntarily) by making sure that all @ kernel/sched/core.c:6892 @ void __sched schedule_preempt_disabled(void) #ifdef CONFIG_PREEMPT_RT void __sched notrace schedule_rtlock(void) { - do { - preempt_disable(); - __schedule(SM_RTLOCK_WAIT); - sched_preempt_enable_no_resched(); - } while (need_resched()); + schedule_loop(SM_RTLOCK_WAIT); } NOKPROBE_SYMBOL(schedule_rtlock); #endif @ kernel/sched/core.c:6926 @ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY +/* + * If TIF_NEED_RESCHED is set then we allow being scheduled away since this is + * set by an RT task. Otherwise we try to avoid being scheduled out as long as + * the preempt_lazy_count counter is > 0. + */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static inline int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption @ kernel/sched/core.c:6963 @ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); @ kernel/sched/core.c:7012 @ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; + do { /* * Because the function tracer can trace preempt_count_sub() @ kernel/sched/core.c:9320 @ void __init init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @ kernel/sched/fair.c:4951 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours.
@ kernel/sched/fair.c:4975 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @ kernel/sched/fair.c:5121 @ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @ kernel/sched/fair.c:5270 @ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @ kernel/sched/fair.c:6145 @ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (task_current(rq, p)) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @ kernel/sched/fair.c:7874 @ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @ kernel/sched/fair.c:12039 @ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @ kernel/sched/fair.c:12066 @ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (task_current(rq, p)) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } @ kernel/sched/features.h:51 @ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif #else /* @ kernel/sched/sched.h:2373 @ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @ kernel/signal.c:435 @ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, return NULL; if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); + + if (!sigqueue_flags) { + struct sighand_struct *sighand = t->sighand; + + lockdep_assert_held(&sighand->siglock); + if (sighand->sigqueue_cache) { + q = sighand->sigqueue_cache; + sighand->sigqueue_cache = NULL; + } + } + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); } else { print_dropped_signal(sig); } @ kernel/signal.c:461 @ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, return q; } -static void __sigqueue_free(struct sigqueue *q) +static bool sigqueue_cleanup_accounting(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) - return; + return false; if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } + return true; +} + +static void __sigqueue_free(struct sigqueue *q) +{ + if (!sigqueue_cleanup_accounting(q)) + return; + 
kmem_cache_free(sigqueue_cachep, q); +} + +void sigqueue_free_cached_entry(struct sigqueue *q) +{ + if (!q) + return; + kmem_cache_free(sigqueue_cachep, q); +} + +static void sigqueue_cache_or_free(struct sigqueue *q) +{ + struct sighand_struct *sighand = current->sighand; + + if (!sigqueue_cleanup_accounting(q)) + return; + + lockdep_assert_held(&sighand->siglock); + if (!sighand->sigqueue_cache) { + sighand->sigqueue_cache = q; + return; + } kmem_cache_free(sigqueue_cachep, q); } @ kernel/signal.c:637 @ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i (info->si_code == SI_TIMER) && (info->si_sys_private); - __sigqueue_free(first); + sigqueue_cache_or_free(first); } else { /* * Ok, it wasn't in the queue. This must be @ kernel/signal.c:2339 @ static int ptrace_stop(int exit_code, int why, unsigned long message, do_notify_parent_cldstop(current, false, why); /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. + * The previous do_notify_parent_cldstop() invocation woke the ptracer. + * On a PREEMPTION kernel this can result in a preemption requirement + * which will be fulfilled after read_unlock() and the ptracer will be + * put on the CPU. + * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for + * this task to wait in schedule(). If this task gets preempted then it + * remains enqueued on the runqueue. The ptracer will observe this and + * then sleep for a delay of one HZ tick. In the meantime this task + * gets scheduled, enters schedule() and will wait for the ptracer. * - * XXX: implement read_unlock_no_resched(). + * This preemption point is not bad from a correctness point of view but + * extends the runtime by one HZ tick due to the ptracer's sleep. + * The preempt-disable section ensures that there will be no preemption + * between unlock and schedule() and so improves performance since + * the ptracer has no reason to sleep. + * + * This optimisation is not doable on PREEMPT_RT due to the spinlock_t + * within the preempt-disable section. */ - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); @ kernel/softirq.c:83 @ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); -} - #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_PER_CPU(int, hardirqs_enabled); DEFINE_PER_CPU(int, hardirq_context); @ kernel/softirq.c:224 @ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) goto out; pending = local_softirq_pending(); - if (!pending || ksoftirqd_running(pending)) + if (!pending) goto out; /* @ kernel/softirq.c:420 @ static inline bool should_wake_ksoftirqd(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @ kernel/softirq.c:453 @ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); @ kernel/softirq.c:622 @ static inline void tick_irq_exit(void) #endif } +#ifdef CONFIG_PREEMPT_RT +DEFINE_PER_CPU(struct task_struct *, timersd); +DEFINE_PER_CPU(unsigned long, pending_timer_softirq); + +static void wake_timersd(void) +{ + struct task_struct *tsk = __this_cpu_read(timersd); + + if (tsk) + wake_up_process(tsk); +} + +#else + +static inline void wake_timersd(void) { } + +#endif + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED @ kernel/softirq.c:652 @ static inline void __irq_exit_rcu(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && + !(in_nmi() | in_hardirq())) + wake_timersd(); + tick_irq_exit(); } @ kernel/softirq.c:983 @ static struct smp_hotplug_thread softirq_threads = { .thread_comm = "ksoftirqd/%u", }; +#ifdef CONFIG_PREEMPT_RT +static void timersd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static int timersd_should_run(unsigned int cpu) +{ + return local_pending_timers(); +} + +static void run_timersd(unsigned int cpu) +{ + unsigned int timer_si; + + ksoftirqd_run_begin(); + + timer_si = local_pending_timers(); + __this_cpu_write(pending_timer_softirq, 0); + or_softirq_pending(timer_si); + + __do_softirq(); + + ksoftirqd_run_end(); +} + +static void raise_ktimers_thread(unsigned int nr) +{ + trace_softirq_raise(nr); + __this_cpu_or(pending_timer_softirq, 1 << nr); +} + +void raise_hrtimer_softirq(void) +{ + raise_ktimers_thread(HRTIMER_SOFTIRQ); +} + +void raise_timer_softirq(void) +{ + unsigned long flags; + + local_irq_save(flags); + raise_ktimers_thread(TIMER_SOFTIRQ); + wake_timersd(); + local_irq_restore(flags); +} + +static struct smp_hotplug_thread timer_threads = { + .store = &timersd, + .setup = timersd_setup, + .thread_should_run = timersd_should_run, + .thread_fn = run_timersd, + .thread_comm = "ktimers/%u", +}; +#endif + static __init int spawn_ksoftirqd(void) { cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); - +#ifdef CONFIG_PREEMPT_RT + BUG_ON(smpboot_register_percpu_thread(&timer_threads)); +#endif return 0; } early_initcall(spawn_ksoftirqd); @ kernel/time/hrtimer.c:1808 @ void hrtimer_interrupt(struct clock_event_device *dev) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = 
KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @ kernel/time/hrtimer.c:1921 @ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @ kernel/time/posix-timers.c:143 @ static struct k_itimer *posix_timer_by_id(timer_t id) static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; struct hlist_head *head; - int ret = -ENOENT; + unsigned int start, id; - do { + /* Can be written by a different task concurrently in the loop below */ + start = READ_ONCE(sig->next_posix_timer_id); + + for (id = ~start; start != id; id++) { spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + id = sig->next_posix_timer_id; + + /* Write the next ID back. Clamp it to the positive space */ + WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX); + + head = &posix_timers_hashtable[hash(sig, id)]; + if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; + spin_unlock(&hash_lock); + return id; } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) @ kernel/time/posix-timers.c:1044 @ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) } /* - * return timer owned by the process, used by exit_itimers + * Delete a timer if it is armed, remove it from the hash and schedule it + * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { -retry_delete: - spin_lock_irq(&timer->it_lock); + unsigned long flags; +retry_delete: + /* + * irqsave is required to make timer_wait_running() work. + */ + spin_lock_irqsave(&timer->it_lock, flags); + + /* + * Even if the timer is no longer accessible from other tasks + * it still might be armed and queued in the underlying timer + * mechanism. Worse, that timer mechanism might run the expiry + * function concurrently. + */ if (timer_delete_hook(timer) == TIMER_RETRY) { - spin_unlock_irq(&timer->it_lock); + /* + * The timer is being expired concurrently. Prevent livelocks + * and pointless spinning on RT. + * + * The CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y case is + * irrelevant here because obviously the exiting task + * cannot be expiring the timer in task work concurrently. + * Ditto for CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n as the + * tick interrupt cannot run on this CPU because the above + * spin_lock disabled interrupts. + * + * timer_wait_running() drops timer::it_lock, which opens + * the possibility for another task to delete the timer. + * + * That's not possible here because this is invoked from + * do_exit() only for the last thread of the thread group. + * So no other task can access that timer.
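+ *
+ * The WARN_ON_ONCE() below checks exactly that assumption:
+ * timer_wait_running() can only return a different timer (or NULL)
+ * if some other task deleted and freed it while timer::it_lock was
+ * dropped.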
+ */ + if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) + return; + + goto retry_delete; + } list_del(&timer->list); - spin_unlock_irq(&timer->it_lock); + spin_unlock_irqrestore(&timer->it_lock, flags); release_posix_timer(timer, IT_ID_SET); } /* - * This is called by do_exit or de_thread, only when nobody else can - * modify the signal->posix_timers list. Yet we need sighand->siglock - * to prevent the race with /proc/pid/timers. + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. */ void exit_itimers(struct task_struct *tsk) { @ kernel/time/posix-timers.c:1106 @ void exit_itimers(struct task_struct *tsk) if (list_empty(&tsk->signal->posix_timers)) return; + /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); list_replace_init(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); + /* The timers are no longer accessible via tsk::signal */ while (!list_empty(&timers)) { tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); @ kernel/time/tick-sched.c:792 @ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); + return local_pending_timers() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) @ kernel/time/timer.c:2057 @ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; } - raise_softirq(TIMER_SOFTIRQ); + raise_timer_softirq(); } /* @ kernel/trace/trace.c:2697 @ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; - if (tif_need_resched()) + if (tif_need_resched_now()) trace_flags |= TRACE_FLAG_NEED_RESCHED; +#ifdef CONFIG_PREEMPT_LAZY + /* Run out of bits. 
Share the LAZY and PREEMPT_RESCHED */ + if (need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; +#else if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | +#endif + + return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (preempt_lazy_count() & 0xff) << 16 | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } @ kernel/trace/trace.c:4298 @ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off/BH-disabled\n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / _-=> migrate-disable \n" - "# ||||| / delay \n" - "# cmd pid |||||| time | caller \n" - "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n" + "# / _-------=> irqs-off/BH-disabled\n" + "# | / _------=> need-resched \n" + "# || / _-----=> need-resched-lazy\n" + "# ||| / _----=> hardirq/softirq \n" + "# |||| / _---=> preempt-depth \n" + "# ||||| / _--=> preempt-lazy-depth\n" + "# |||||| / _-=> migrate-disable \n" + "# ||||||| / delay \n" + "# cmd pid |||||||| time | caller \n" + "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @ kernel/trace/trace.c:4342 @ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); - seq_printf(m, "# %.*s|||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-------=> irqs-off/BH-disabled\n", prec, space); + seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); + seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); + seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } void @ kernel/trace/trace_events.c:211 @ static int trace_define_common_fields(void) /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned char, preempt_lazy_count); return ret; } @ kernel/trace/trace_output.c:445 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @ kernel/trace/trace_output.c:466 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) { +#ifndef CONFIG_PREEMPT_LAZY case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'N'; break; 
+#endif case TRACE_FLAG_NEED_RESCHED: need_resched = 'n'; break; +#ifndef CONFIG_PREEMPT_LAZY case TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'p'; break; +#endif default: need_resched = '.'; break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; + hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : @ kernel/trace/trace_output.c:495 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.' ; - trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq); + trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq); if (entry->preempt_count & 0xf) trace_seq_printf(s, "%x", entry->preempt_count & 0xf); else trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + trace_seq_putc(s, '.'); + if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else @ localversion-rt:1 @ +-rt15 @ net/core/dev.c:4576 @ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ -/* Called from hardirq (IPI) context */ -static void trigger_rx_softirq(void *data) -{ - struct softnet_data *sd = data; - - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - smp_store_release(&sd->defer_ipi_scheduled, 0); -} - /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 @ net/core/dev.c:6626 @ static void skb_defer_free_flush(struct softnet_data *sd) } } +#ifndef CONFIG_PREEMPT_RT +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + +#else + +static void trigger_rx_softirq(struct work_struct *defer_work) +{ + struct softnet_data *sd; + + sd = container_of(defer_work, struct softnet_data, defer_work); + smp_store_release(&sd->defer_ipi_scheduled, 0); + local_bh_disable(); + skb_defer_free_flush(sd); + local_bh_enable(); +} +#endif + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @ net/core/dev.c:11427 @ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif +#ifndef CONFIG_PREEMPT_RT INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); +#else + INIT_WORK(&sd->defer_work, trigger_rx_softirq); +#endif spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); @ net/core/skbuff.c:6859 @ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { +#ifndef CONFIG_PREEMPT_RT smp_call_function_single_async(cpu, &sd->defer_csd); +#else + schedule_work_on(cpu, &sd->defer_work); +#endif + } }