@ Documentation/admin-guide/cgroup-v1/memory.rst:67 @ Brief summary of control files. threads cgroup.procs show list of processes cgroup.event_control an interface for event_fd() + This knob is not available on CONFIG_PREEMPT_RT systems. memory.usage_in_bytes show current usage for memory (See 5.5 for details) memory.memsw.usage_in_bytes show current usage for memory+Swap @ Documentation/admin-guide/cgroup-v1/memory.rst:79 @ Brief summary of control files. memory.max_usage_in_bytes show max memory usage recorded memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded memory.soft_limit_in_bytes set/show soft limit of memory usage + This knob is not available on CONFIG_PREEMPT_RT systems. memory.stat show various statistics memory.use_hierarchy set/show hierarchical account enabled This knob is deprecated and shouldn't be @ arch/alpha/include/asm/spinlock_types.h:5 @ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/arm/Kconfig:36 @ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @ arch/arm/Kconfig:72 @ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_MMAP_RND_BITS if MMU @ arch/arm/Kconfig:113 @ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ @ arch/arm/Kconfig:129 @ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO @ arch/arm/include/asm/spinlock_types.h:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/arm/include/asm/thread_info.h:57 @ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ #ifndef CONFIG_THREAD_INFO_IN_TASK struct task_struct *task; /* main task structure */ #endif @ arch/arm/include/asm/thread_info.h:156 @ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 9 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @ arch/arm/include/asm/thread_info.h:171 @ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) /* Checks for any syscall work in entry-common.S */ @ arch/arm/include/asm/thread_info.h:181 @ extern int vfp_restore_user_hwstate(struct user_vfp *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NOTIFY_SIGNAL) @ arch/arm/kernel/asm-offsets.c:46 @ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); #ifndef CONFIG_THREAD_INFO_IN_TASK DEFINE(TI_TASK, offsetof(struct thread_info, task)); #endif @ arch/arm/kernel/entry-armv.S:206 @ ENDPROC(__dabt_svc) #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! + + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @ arch/arm/kernel/entry-armv.S:232 @ ENDPROC(__irq_svc) 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: @ arch/arm/kernel/signal.c:610 @ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) @ arch/arm/mm/fault.c:410 @ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @ arch/arm/mm/fault.c:483 @ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } @ arch/arm64/Kconfig:92 @ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @ arch/arm64/Kconfig:196 @ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PREEMPT_LAZY select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX @ arch/arm64/include/asm/pgtable.h:1004 @ static inline void update_mmu_cache(struct vm_area_struct *vma, */ static inline bool arch_faults_on_old_pte(void) { - WARN_ON(preemptible()); + WARN_ON(is_migratable()); return !cpu_has_hw_af(); } @ arch/arm64/include/asm/preempt.h:73 @ static inline bool __preempt_count_dec_and_test(void) * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ - return !pc || !READ_ONCE(ti->preempt_count); + if (!pc || !READ_ONCE(ti->preempt_count)) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if ((pc & ~PREEMPT_NEED_RESCHED)) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif } static inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + if (pc == preempt_offset) + return true; + + if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) + return false; + + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; +#endif } #ifdef CONFIG_PREEMPTION @ arch/arm64/include/asm/signal.h:25 @ static inline void __user *arch_untagged_si_addr(void __user *addr, } #define arch_untagged_si_addr arch_untagged_si_addr +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #endif @ arch/arm64/include/asm/spinlock_types.h:8 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif @ arch/arm64/include/asm/thread_info.h:29 @ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { @ arch/arm64/include/asm/thread_info.h:72 @ int arch_dup_task_struct(struct task_struct *dst, #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @ arch/arm64/include/asm/thread_info.h:103 @ int arch_dup_task_struct(struct task_struct *dst, #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) @ arch/arm64/include/asm/thread_info.h:115 @ int arch_dup_task_struct(struct task_struct *dst, _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ @ arch/arm64/kernel/asm-offsets.c:35 @ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif @ arch/arm64/kernel/fpsimd.c:204 @ static void __get_cpu_fpsimd_context(void) * * The double-underscore version must only be called if you know the task * can't be preempted. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ static void get_cpu_fpsimd_context(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); __get_cpu_fpsimd_context(); } @ arch/arm64/kernel/fpsimd.c:237 @ static void __put_cpu_fpsimd_context(void) static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } static bool have_cpu_fpsimd_context(void) @ arch/arm64/kernel/fpsimd.c:1140 @ static void fpsimd_flush_thread_vl(enum vec_type type) void fpsimd_flush_thread(void) { + void *sve_state = NULL; + if (!system_supports_fpsimd()) return; @ arch/arm64/kernel/fpsimd.c:1153 @ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); + + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; + fpsimd_flush_thread_vl(ARM64_VEC_SVE); } put_cpu_fpsimd_context(); + kfree(sve_state); } /* @ arch/arm64/kernel/signal.c:923 @ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @ arch/arm64/kernel/signal.c:931 @ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) } else { local_daif_restore(DAIF_PROCCTX); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); @ arch/arm64/kvm/arm.c:823 @ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * involves poking the GIC, which must be done in a * non-preemptible context. */ - preempt_disable(); + migrate_disable(); kvm_pmu_flush_hwstate(vcpu); @ arch/arm64/kvm/arm.c:847 @ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); - preempt_enable(); + migrate_enable(); continue; } @ arch/arm64/kvm/arm.c:919 @ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); - preempt_enable(); + migrate_enable(); /* * The ARMv8 architecture doesn't give the hypervisor @ arch/csky/include/asm/spinlock_types.h:6 @ #ifndef __ASM_CSKY_SPINLOCK_TYPES_H #define __ASM_CSKY_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/hexagon/include/asm/spinlock_types.h:11 @ #ifndef _ASM_SPINLOCK_TYPES_H #define _ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/ia64/include/asm/spinlock_types.h:5 @ #ifndef _ASM_IA64_SPINLOCK_TYPES_H #define _ASM_IA64_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/ia64/include/asm/thread_info.h:58 @ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_stack_node(tsk, node) \ +#define arch_alloc_thread_stack_node(tsk, node) \ ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) +#define arch_alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_stack(tsk) /* nothing */ +#define arch_free_thread_stack(tsk) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS @ arch/powerpc/Kconfig:156 @ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @ arch/powerpc/Kconfig:225 @ config PPC select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE select HAVE_KERNEL_LZO if DEFAULT_UIMAGE @ arch/powerpc/Kconfig:242 @ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ @ arch/powerpc/include/asm/simple_spinlock_types.h:5 @ #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/powerpc/include/asm/spinlock_types.h:5 @ #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H #define _ASM_POWERPC_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/powerpc/include/asm/stackprotector.h:27 @ static __always_inline void boot_init_stack_canary(void) unsigned long canary; /* Try to get a semi random initial value. */ +#ifdef CONFIG_PREEMPT_RT + canary = (unsigned long)&canary; +#else canary = get_random_canary(); +#endif canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; @ arch/powerpc/include/asm/thread_info.h:50 @ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ #ifdef CONFIG_SMP unsigned int cpu; #endif @ arch/powerpc/include/asm/thread_info.h:76 @ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .preempt_count = INIT_PREEMPT_COUNT, \ + .preempt_lazy_count = 0, \ .flags = 0, \ } @ arch/powerpc/include/asm/thread_info.h:102 @ void arch_setup_new_exec(void); #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @ arch/powerpc/include/asm/thread_info.h:118 @ void arch_setup_new_exec(void); #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) @ arch/powerpc/include/asm/thread_info.h:130 @ void arch_setup_new_exec(void); #define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_RESTOREALL (1<<TIF_RESTOREALL) #define _TIF_NOERROR (1<<TIF_NOERROR) @ arch/powerpc/include/asm/thread_info.h:144 @ void arch_setup_new_exec(void); _TIF_SYSCALL_EMU) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ _TIF_NOTIFY_SIGNAL) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ @ arch/powerpc/kernel/interrupt.c:349 @ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) ti_flags = READ_ONCE(current_thread_info()->flags); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { /* @ arch/powerpc/kernel/interrupt.c:555 @ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) /* Returning to a kernel context with local irqs enabled. */ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: - if (IS_ENABLED(CONFIG_PREEMPT)) { + if (IS_ENABLED(CONFIG_PREEMPTION)) { /* Return to preemptible kernel context */ if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) { if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { + if ((preempt_count() == 0) && + (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } @ arch/powerpc/kernel/irq.c:693 @ static inline void check_stack_overflow(void) } } +#ifndef CONFIG_PREEMPT_RT static __always_inline void call_do_softirq(const void *sp) { /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ @ arch/powerpc/kernel/irq.c:712 @ static __always_inline void call_do_softirq(const void *sp) "r11", "r12" ); } +#endif static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) { @ arch/powerpc/kernel/irq.c:825 @ void *mcheckirq_ctx[NR_CPUS] __read_mostly; void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { call_do_softirq(softirq_ctx[smp_processor_id()]); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { @ arch/powerpc/kernel/traps.c:263 @ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", @ arch/powerpc/kvm/Kconfig:181 @ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING @ arch/powerpc/platforms/pseries/iommu.c:27 @ #include <linux/of.h> #include <linux/iommu.h> #include <linux/rculist.h> +#include <linux/local_lock.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @ arch/powerpc/platforms/pseries/iommu.c:199 @ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @ arch/powerpc/platforms/pseries/iommu.c:228 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @ arch/powerpc/platforms/pseries/iommu.c:240 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tceshift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> tceshift; @ arch/powerpc/platforms/pseries/iommu.c:275 @ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @ arch/powerpc/platforms/pseries/iommu.c:451 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @ arch/powerpc/platforms/pseries/iommu.c:504 @ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return rc; } @ arch/riscv/include/asm/spinlock_types.h:9 @ #ifndef _ASM_RISCV_SPINLOCK_TYPES_H #define _ASM_RISCV_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/s390/include/asm/spinlock_types.h:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/sh/include/asm/spinlock_types.h:5 @ #ifndef __ASM_SH_SPINLOCK_TYPES_H #define __ASM_SH_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ arch/sh/kernel/irq.c:152 @ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct thread_info *curctx; @ arch/sh/kernel/irq.c:180 @ void do_softirq_own_stack(void) "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" ); } +#endif #else static inline void handle_one_irq(unsigned int irq) { @ arch/sparc/kernel/irq_64.c:858 @ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { void *orig_sp, *sp = softirq_stack[smp_processor_id()]; @ arch/sparc/kernel/irq_64.c:873 @ void do_softirq_own_stack(void) __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp)); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) @ arch/x86/Kconfig:111 @ config X86 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @ arch/x86/Kconfig:238 @ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API @ arch/x86/include/asm/pgtable.h:25 @ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) #ifndef __ASSEMBLY__ +#include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> @ arch/x86/include/asm/preempt.h:93 @ static __always_inline void __preempt_count_sub(int val) * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if (preempt_count()) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u32 tmp; + tmp = raw_cpu_read_4(__preempt_count); + if (tmp == preempt_offset) + return true; + + /* preempt count == 0 ? */ + tmp &= ~PREEMPT_NEED_RESCHED; + if (tmp != preempt_offset) + return false; + /* XXX PREEMPT_LOCK_OFFSET */ + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); +#endif } #ifdef CONFIG_PREEMPTION @ arch/x86/include/asm/signal.h:31 @ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. + */ +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT #define compat_sigset_t compat_sigset_t typedef sigset_t compat_sigset_t; @ arch/x86/include/asm/thread_info.h:60 @ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ #ifdef CONFIG_SMP u32 cpu; /* current CPU */ #endif @ arch/x86/include/asm/thread_info.h:70 @ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ + .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ @ arch/x86/include/asm/thread_info.h:99 @ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @ arch/x86/include/asm/thread_info.h:124 @ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) @ arch/x86/kernel/cpu/mshyperv.c:82 @ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); set_irq_regs(old_regs); @ arch/x86/kvm/x86.c:8658 @ int kvm_arch_init(void *opaque) goto out; } + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + r = -EOPNOTSUPP; + goto out; + } + r = -ENOMEM; x86_emulator_cache = kvm_alloc_emulator_cache(); @ arch/xtensa/include/asm/spinlock_types.h:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif @ block/blk-mq.c:1860 @ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); + int cpu = get_cpu_light(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); + put_cpu_light(); return; } - put_cpu(); + put_cpu_light(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, @ crypto/cryptd.c:39 @ static struct workqueue_struct *cryptd_wq; struct cryptd_cpu_queue { struct crypto_queue queue; struct work_struct work; + spinlock_t qlock; }; struct cryptd_queue { @ crypto/cryptd.c:109 @ static int cryptd_init_queue(struct cryptd_queue *queue, cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); INIT_WORK(&cpu_queue->work, cryptd_queue_worker); + spin_lock_init(&cpu_queue->qlock); } pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen); return 0; @ crypto/cryptd.c:134 @ static int cryptd_enqueue_request(struct cryptd_queue *queue, struct cryptd_cpu_queue *cpu_queue; refcount_t *refcnt; - cpu = get_cpu(); - cpu_queue = this_cpu_ptr(queue->cpu_queue); + cpu_queue = raw_cpu_ptr(queue->cpu_queue); + spin_lock_bh(&cpu_queue->qlock); + cpu = smp_processor_id(); + err = crypto_enqueue_request(&cpu_queue->queue, request); refcnt = crypto_tfm_ctx(request->tfm); @ crypto/cryptd.c:153 @ static int cryptd_enqueue_request(struct cryptd_queue *queue, refcount_inc(refcnt); out_put_cpu: - put_cpu(); + spin_unlock_bh(&cpu_queue->qlock); return err; } @ crypto/cryptd.c:169 @ static void cryptd_queue_worker(struct work_struct *work) cpu_queue = container_of(work, struct cryptd_cpu_queue, work); /* * Only handle one request at a time to avoid hogging crypto workqueue. - * preempt_disable/enable is used to prevent being preempted by - * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent - * cryptd_enqueue_request() being accessed from software interrupts. */ - local_bh_disable(); - preempt_disable(); + spin_lock_bh(&cpu_queue->qlock); backlog = crypto_get_backlog(&cpu_queue->queue); req = crypto_dequeue_request(&cpu_queue->queue); - preempt_enable(); - local_bh_enable(); + spin_unlock_bh(&cpu_queue->qlock); if (!req) return; @ drivers/block/zram/zram_drv.c:62 @ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +#ifdef CONFIG_PREEMPT_RT +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + +static int zram_slot_trylock(struct zram *zram, u32 index) +{ + int ret; + + ret = spin_trylock(&zram->table[index].lock); + if (ret) + __set_bit(ZRAM_LOCK, &zram->table[index].flags); + return ret; +} + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + spin_lock(&zram->table[index].lock); + __set_bit(ZRAM_LOCK, &zram->table[index].flags); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + __clear_bit(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); +} + +#else + +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } static int zram_slot_trylock(struct zram *zram, u32 index) { @ drivers/block/zram/zram_drv.c:111 @ static void zram_slot_unlock(struct zram *zram, u32 index) { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } +#endif static inline bool init_done(struct zram *zram) { @ drivers/block/zram/zram_drv.c:1237 @ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zram_meta_init_table_locks(zram, num_pages); return true; } @ drivers/block/zram/zram_drv.h:66 @ struct zram_table_entry { unsigned long element; }; unsigned long flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif @ drivers/char/random.c:203 @ * void add_device_randomness(const void *buf, unsigned int size); * void add_input_randomness(unsigned int type, unsigned int code, * unsigned int value); - * void add_interrupt_randomness(int irq, int irq_flags); + * void add_interrupt_randomness(int irq); * void add_disk_randomness(struct gendisk *disk); * * add_device_randomness() is for adding data to the random pool that @ drivers/char/random.c:1263 @ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) return *ptr; } -void add_interrupt_randomness(int irq, int irq_flags) +static bool process_interrupt_randomness_pool(struct fast_pool *fast_pool) { struct entropy_store *r; + + if (unlikely(crng_init == 0)) { + bool pool_reset = false; + + if ((fast_pool->count >= 64) && + crng_fast_load((char *) fast_pool->pool, + sizeof(fast_pool->pool))) + pool_reset = true; + + return pool_reset; + } + + if ((fast_pool->count < 64) && + !time_after(jiffies, fast_pool->last + HZ)) + return false; + + r = &input_pool; + if (!spin_trylock(&r->lock)) + return false; + + __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); + spin_unlock(&r->lock); + + /* award one bit for the contents of the fast pool */ + credit_entropy_bits(r, 1); + return true; +} + +#ifdef CONFIG_PREEMPT_RT +void process_interrupt_randomness(void) +{ + struct fast_pool *cpu_pool; + struct fast_pool fast_pool; + + lockdep_assert_irqs_enabled(); + + migrate_disable(); + cpu_pool = this_cpu_ptr(&irq_randomness); + + local_irq_disable(); + memcpy(&fast_pool, cpu_pool, sizeof(fast_pool)); + local_irq_enable(); + + if (process_interrupt_randomness_pool(&fast_pool)) { + local_irq_disable(); + cpu_pool->last = jiffies; + cpu_pool->count = 0; + local_irq_enable(); + } + memzero_explicit(&fast_pool, sizeof(fast_pool)); + migrate_enable(); +} +#endif + +void add_interrupt_randomness(int irq) +{ struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; @ drivers/char/random.c:1343 @ void add_interrupt_randomness(int irq, int irq_flags) fast_mix(fast_pool); add_interrupt_bench(cycles); - if (unlikely(crng_init == 0)) { - if ((fast_pool->count >= 64) && - crng_fast_load((char *) fast_pool->pool, - sizeof(fast_pool->pool))) { - fast_pool->count = 0; + /* + * On PREEMPT_RT the entropy can not be fed into the input_pool because + * it needs to acquire sleeping locks with disabled interrupts. + * This is deferred to the threaded handler. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (process_interrupt_randomness_pool(fast_pool)) { fast_pool->last = now; + fast_pool->count = 0; } - return; } - - if ((fast_pool->count < 64) && - !time_after(now, fast_pool->last + HZ)) - return; - - r = &input_pool; - if (!spin_trylock(&r->lock)) - return; - - fast_pool->last = now; - __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); - spin_unlock(&r->lock); - - fast_pool->count = 0; - - /* award one bit for the contents of the fast pool */ - credit_entropy_bits(r, 1); } EXPORT_SYMBOL_GPL(add_interrupt_randomness); @ drivers/char/tpm/tpm_tis.c:53 @ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } +#ifdef CONFIG_PREEMPT_RT +/* + * Flushes previous write operations to chip so that a subsequent + * ioread*()s won't stall a cpu. + */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @ drivers/char/tpm/tpm_tis.c:197 @ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); return 0; } @ drivers/char/tpm/tpm_tis.c:224 @ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); - iowrite32(value, phy->iobase + addr); + tpm_tis_iowrite32(value, phy->iobase, addr); return 0; } @ drivers/gpu/drm/i915/display/intel_crtc.c:428 @ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) */ intel_psr_wait_for_idle(new_crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; @ drivers/gpu/drm/i915/display/intel_crtc.c:454 @ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); @ drivers/gpu/drm/i915/display/intel_crtc.c:493 @ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) @ drivers/gpu/drm/i915/display/intel_crtc.c:573 @ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) new_crtc_state->uapi.event = NULL; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); /* Send VRR Push to terminate Vblank */ intel_vrr_send_push(new_crtc_state); @ drivers/gpu/drm/i915/gt/intel_breadcrumbs.c:314 @ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); + irq_work_queue(&b->irq_work); cond_resched(); + irq_work_sync(&b->irq_work); } } @ drivers/gpu/drm/i915/gt/intel_context.h:214 @ static inline void intel_context_enter(struct intel_context *ce) static inline void intel_context_mark_active(struct intel_context *ce) { - lockdep_assert_held(&ce->timeline->mutex); + lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || + test_bit(CONTEXT_IS_PARKING, &ce->flags)); ++ce->active_count; } @ drivers/gpu/drm/i915/gt/intel_context_types.h:121 @ struct intel_context { #define CONTEXT_LRCA_DIRTY 9 #define CONTEXT_GUC_INIT 10 #define CONTEXT_PERMA_PIN 11 +#define CONTEXT_IS_PARKING 12 struct { u64 timeout_us; @ drivers/gpu/drm/i915/gt/intel_engine_pm.c:83 @ static int __engine_unpark(struct intel_wakeref *wf) return 0; } -#if IS_ENABLED(CONFIG_LOCKDEP) - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - unsigned long flags; - - local_irq_save(flags); - mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); - - return flags; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ - mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); - local_irq_restore(flags); -} - -#else - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - return 0; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ -} - -#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ - static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) { struct i915_request *rq = to_request(fence); @ drivers/gpu/drm/i915/gt/intel_engine_pm.c:129 @ static bool switch_to_kernel_context(struct intel_engine_cs *engine) { struct intel_context *ce = engine->kernel_context; struct i915_request *rq; - unsigned long flags; bool result = true; /* @ drivers/gpu/drm/i915/gt/intel_engine_pm.c:183 @ static bool switch_to_kernel_context(struct intel_engine_cs *engine) * engine->wakeref.count, we may see the request completion and retire * it causing an underflow of the engine->wakeref. */ - flags = __timeline_mark_lock(ce); + set_bit(CONTEXT_IS_PARKING, &ce->flags); GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); rq = __i915_request_create(ce, GFP_NOWAIT); @ drivers/gpu/drm/i915/gt/intel_engine_pm.c:215 @ static bool switch_to_kernel_context(struct intel_engine_cs *engine) result = false; out_unlock: - __timeline_mark_unlock(ce, flags); + clear_bit(CONTEXT_IS_PARKING, &ce->flags); return result; } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1287 @ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ - spin_lock(&sched_engine->lock); + spin_lock_irq(&sched_engine->lock); /* * If the queue is higher priority than the last @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1387 @ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); return; } } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1413 @ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); - spin_unlock(&engine->sched_engine->lock); + spin_unlock_irq(&engine->sched_engine->lock); return; /* leave this for another sibling */ } @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1575 @ static void execlists_dequeue(struct intel_engine_cs *engine) */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); /* * We can skip poking the HW if we ended up with exactly the same set @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:1601 @ static void execlists_dequeue(struct intel_engine_cs *engine) } } -static void execlists_dequeue_irq(struct intel_engine_cs *engine) -{ - local_irq_disable(); /* Suspend interrupts across request submission */ - execlists_dequeue(engine); - local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ -} - static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); @ drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2420 @ static void execlists_submission_tasklet(struct tasklet_struct *t) } if (!engine->execlists.pending[0]) { - execlists_dequeue_irq(engine); + execlists_dequeue(engine); start_timeslice(engine); } @ drivers/gpu/drm/i915/i915_irq.c:919 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Get optional system timestamp before query. */ if (stime) @ drivers/gpu/drm/i915/i915_irq.c:984 @ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, if (etime) *etime = ktime_get(); - /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); @ drivers/gpu/drm/i915/i915_request.c:563 @ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @ drivers/gpu/drm/i915/i915_request.c:671 @ void __i915_request_unsubmit(struct i915_request *request) */ RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @ drivers/gpu/drm/i915/i915_request.h:645 @ i915_request_timeline(const struct i915_request *rq) { /* Valid only while the request is being constructed (or retired). */ return rcu_dereference_protected(rq->timeline, - lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); + lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || + test_bit(CONTEXT_IS_PARKING, &rq->context->flags)); } static inline struct i915_gem_context * @ drivers/gpu/drm/i915/i915_trace.h:5 @ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#ifdef CONFIG_PREEMPT_RT +#define NOTRACE +#endif + #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> @ drivers/gpu/drm/i915/i915_trace.h:826 @ DEFINE_EVENT(i915_request, i915_request_add, TP_ARGS(rq) ); -#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) +#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) @ drivers/gpu/drm/i915/i915_utils.h:347 @ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ -#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) +#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) @ drivers/hv/vmbus_drv.c:1384 @ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } - add_interrupt_randomness(vmbus_interrupt, 0); + add_interrupt_randomness(vmbus_interrupt); } static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) @ drivers/i2c/busses/i2c-cht-wc.c:102 @ static irqreturn_t cht_wc_i2c_adap_thread_handler(int id, void *data) * interrupt handler as well, so running the client irq handler from * this thread will cause things to lock up. */ - if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { - /* - * generic_handle_irq expects local IRQs to be disabled - * as normally it is called from interrupt context. - */ - local_irq_disable(); - generic_handle_irq(adap->client_irq); - local_irq_enable(); - } + if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) + generic_handle_irq_safe(adap->client_irq); return IRQ_HANDLED; } @ drivers/i2c/i2c-core-base.c:1426 @ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) if (irq <= 0) return -ENXIO; - generic_handle_irq(irq); + generic_handle_irq_safe(irq); return 0; } @ drivers/md/raid5.c:2220 @ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @ drivers/md/raid5.c:2281 @ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) @ drivers/md/raid5.c:7107 @ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) __func__, cpu); return -ENOMEM; } + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); return 0; } @ drivers/md/raid5.h:638 @ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address @ drivers/mfd/ezx-pcap.c:196 @ static void pcap_isr_work(struct work_struct *work) ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); ezx_pcap_write(pcap, PCAP_REG_ISR, isr); - local_irq_disable(); service = isr & ~msr; for (irq = pcap->irq_base; service; service >>= 1, irq++) { if (service & 1) - generic_handle_irq(irq); + generic_handle_irq_safe(irq); } - local_irq_enable(); ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); } while (gpio_get_value(pdata->gpio)); } @ drivers/misc/hi6421v600-irq.c:120 @ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) * If both powerkey down and up IRQs are received, * handle them at the right order */ - generic_handle_irq(priv->irqs[POWERKEY_DOWN]); - generic_handle_irq(priv->irqs[POWERKEY_UP]); + generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); + generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; } @ drivers/misc/hi6421v600-irq.c:129 @ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) continue; for_each_set_bit(offset, &pending, BITS_PER_BYTE) { - generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); + generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); } } @ drivers/mmc/core/block.c:2054 @ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) mmc_put_card(mq->card, &mq->ctx); } -static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req, + bool can_sleep) { struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req); struct mmc_request *mrq = &mqrq->brq.mrq; @ drivers/mmc/core/block.c:2067 @ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) * Block layer timeouts race with completions which means the normal * completion path cannot be used during recovery. */ - if (mq->in_recovery) + if (mq->in_recovery) { mmc_blk_mq_complete_rq(mq, req); - else if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); + } else if (likely(!blk_should_fake_timeout(req->q))) { + if (can_sleep) + blk_mq_complete_request_direct(req, mmc_blk_mq_complete); + else + blk_mq_complete_request(req); + } mmc_blk_mq_dec_in_flight(mq, req); } @ drivers/mmc/core/block.c:2095 @ void mmc_blk_mq_recovery(struct mmc_queue *mq) mmc_blk_urgent_bkops(mq, mqrq); - mmc_blk_mq_post_req(mq, req); + mmc_blk_mq_post_req(mq, req, true); } static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, @ drivers/mmc/core/block.c:2114 @ static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, if (prev_req) *prev_req = mq->complete_req; else - mmc_blk_mq_post_req(mq, mq->complete_req); + mmc_blk_mq_post_req(mq, mq->complete_req, true); mq->complete_req = NULL; @ drivers/mmc/core/block.c:2186 @ static void mmc_blk_mq_req_done(struct mmc_request *mrq) mq->rw_wait = false; wake_up(&mq->wait); - mmc_blk_mq_post_req(mq, req); + /* context unknown */ + mmc_blk_mq_post_req(mq, req, false); } static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) @ drivers/mmc/core/block.c:2247 @ static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq, err = mmc_start_request(host, &mqrq->brq.mrq); if (prev_req) - mmc_blk_mq_post_req(mq, prev_req); + mmc_blk_mq_post_req(mq, prev_req, true); if (err) mq->rw_wait = false; @ drivers/net/usb/lan78xx.c:1370 @ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); lan78xx_defer_kevent(dev, EVENT_LINK_RESET); - if (dev->domain_data.phyirq > 0) { - local_irq_disable(); - generic_handle_irq(dev->domain_data.phyirq); - local_irq_enable(); - } + if (dev->domain_data.phyirq > 0) + generic_handle_irq_safe(dev->domain_data.phyirq); } else { netdev_warn(dev->net, "unexpected interrupt: 0x%08x\n", intdata); @ drivers/scsi/fcoe/fcoe.c:1453 @ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @ drivers/scsi/fcoe/fcoe.c:1642 @ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @ drivers/scsi/fcoe/fcoe.c:1687 @ static void fcoe_recv_frame(struct sk_buff *skb) */ hp = (struct fcoe_hdr *) skb_network_header(skb); - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { if (stats->ErrorFrames < 5) printk(KERN_WARNING "fcoe: FCoE version " @ drivers/scsi/fcoe/fcoe.c:1719 @ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } @ drivers/scsi/fcoe/fcoe_ctlr.c:831 @ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @ drivers/scsi/fcoe/fcoe_ctlr.c:867 @ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ @ drivers/scsi/libfc/fc_exch.c:828 @ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { @ drivers/staging/greybus/gpio.c:394 @ static int gb_gpio_request_handler(struct gb_operation *op) return -EINVAL; } - local_irq_disable(); - ret = generic_handle_irq(irq); - local_irq_enable(); - + ret = generic_handle_irq_safe(irq); if (ret) dev_err(dev, "failed to invoke irq handler\n"); @ drivers/tty/serial/8250/8250.h:135 @ static inline void serial_dl_write(struct uart_8250_port *up, int value) up->dl_write(up, value); } +static inline void serial8250_set_IER(struct uart_8250_port *up, + unsigned char ier) +{ + struct uart_port *port = &up->port; + unsigned long flags; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + printk_cpu_sync_get_irqsave(flags); + + serial_out(up, UART_IER, ier); + + if (is_console) + printk_cpu_sync_put_irqrestore(flags); +} + +static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + unsigned int clearval = 0; + unsigned long flags; + unsigned int prior; + bool is_console; + + is_console = uart_console(port); + + if (up->capabilities & UART_CAP_UUE) + clearval = UART_IER_UUE; + + if (is_console) + printk_cpu_sync_get_irqsave(flags); + + prior = serial_port_in(port, UART_IER); + serial_port_out(port, UART_IER, clearval); + + if (is_console) + printk_cpu_sync_put_irqrestore(flags); + + return prior; +} + static inline bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @ drivers/tty/serial/8250/8250.h:192 @ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @ drivers/tty/serial/8250/8250_core.c:267 @ static void serial8250_backup_timeout(struct timer_list *t) * Must disable interrupts or else we risk racing with the interrupt * based handler. */ - if (up->port.irq) { - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); - } + if (up->port.irq) + ier = serial8250_clear_IER(up); iir = serial_in(up, UART_IIR); @ drivers/tty/serial/8250/8250_core.c:291 @ static void serial8250_backup_timeout(struct timer_list *t) serial8250_tx_chars(up); if (up->port.irq) - serial_out(up, UART_IER, ier); + serial8250_set_IER(up, ier); spin_unlock_irqrestore(&up->port.lock, flags); @ drivers/tty/serial/8250/8250_core.c:569 @ serial8250_register_ports(struct uart_driver *drv, struct device *dev) #ifdef CONFIG_SERIAL_8250_CONSOLE +static void univ8250_console_write_atomic(struct console *co, const char *s, + unsigned int count) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_atomic(up, s, count); +} + static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { @ drivers/tty/serial/8250/8250_core.c:670 @ static int univ8250_console_match(struct console *co, char *name, int idx, static struct console univ8250_console = { .name = "ttyS", + .write_atomic = univ8250_console_write_atomic, .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, @ drivers/tty/serial/8250/8250_fsl.c:59 @ int fsl8250_handle_irq(struct uart_port *port) /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long flags; unsigned long delay; + bool is_console; + is_console = uart_console(port); + + if (is_console) + printk_cpu_sync_get_irqsave(flags); up->ier = port->serial_in(port, UART_IER); + if (is_console) + printk_cpu_sync_put_irqrestore(flags); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { @ drivers/tty/serial/8250/8250_ingenic.c:149 @ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { + unsigned long flags; + bool is_console; int ier; switch (offset) { @ drivers/tty/serial/8250/8250_ingenic.c:172 @ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) * If we have enabled modem status IRQs we should enable * modem mode. */ + is_console = uart_console(p); + if (is_console) + printk_cpu_sync_get_irqsave(flags); ier = p->serial_in(p, UART_IER); + if (is_console) + printk_cpu_sync_put_irqrestore(flags); if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; @ drivers/tty/serial/8250/8250_mtk.c:221 @ static void mtk8250_shutdown(struct uart_port *port) static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + printk_cpu_sync_get_irqsave(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) + printk_cpu_sync_put_irqrestore(flags); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + + if (uart_console(port)) + printk_cpu_sync_get_irqsave(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (uart_console(port)) + printk_cpu_sync_put_irqrestore(flags); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) @ drivers/tty/serial/8250/8250_port.c:765 @ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } - serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); + serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); @ drivers/tty/serial/8250/8250_port.c:1439 @ static void serial8250_stop_rx(struct uart_port *port) up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:1469 @ void serial8250_em485_stop_tx(struct uart_8250_port *p) serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_port_out(&p->port, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); @ drivers/tty/serial/8250/8250_port.c:1691 @ static void serial8250_disable_ms(struct uart_port *port) mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void serial8250_enable_ms(struct uart_port *port) @ drivers/tty/serial/8250/8250_port.c:1707 @ static void serial8250_enable_ms(struct uart_port *port) up->ier |= UART_IER_MSI; serial8250_rpm_get(up); - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:2128 @ static void serial8250_put_poll_char(struct uart_port *port, struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); wait_for_xmitr(up, BOTH_EMPTY); /* @ drivers/tty/serial/8250/8250_port.c:2141 @ static void serial8250_put_poll_char(struct uart_port *port, * and restore the IER */ wait_for_xmitr(up, BOTH_EMPTY); - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); serial8250_rpm_put(up); } @ drivers/tty/serial/8250/8250_port.c:2444 @ void serial8250_do_shutdown(struct uart_port *port) */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); @ drivers/tty/serial/8250/8250_port.c:2826 @ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; @ drivers/tty/serial/8250/8250_port.c:3292 @ EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE -static void serial8250_console_putchar(struct uart_port *port, int ch) +static void serial8250_console_putchar_locked(struct uart_port *port, int ch) { struct uart_8250_port *up = up_to_u8250p(port); @ drivers/tty/serial/8250/8250_port.c:3300 @ static void serial8250_console_putchar(struct uart_port *port, int ch) serial_port_out(port, UART_TX, ch); } +static void serial8250_console_putchar(struct uart_port *port, int ch) +{ + struct uart_8250_port *up = up_to_u8250p(port); + unsigned long flags; + + wait_for_xmitr(up, UART_LSR_THRE); + + printk_cpu_sync_get_irqsave(flags); + serial8250_console_putchar_locked(port, ch); + printk_cpu_sync_put_irqrestore(flags); +} + /* * Restore serial console when h/w power-off detected */ @ drivers/tty/serial/8250/8250_port.c:3333 @ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } +void serial8250_console_write_atomic(struct uart_8250_port *up, + const char *s, unsigned int count) +{ + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + + printk_cpu_sync_get_irqsave(flags); + + touch_nmi_watchdog(); + + ier = serial8250_clear_IER(up); + + if (atomic_fetch_inc(&up->console_printing)) { + uart_console_write(port, "\n", 1, + serial8250_console_putchar_locked); + } + uart_console_write(port, s, count, serial8250_console_putchar_locked); + atomic_dec(&up->console_printing); + + wait_for_xmitr(up, BOTH_EMPTY); + serial8250_set_IER(up, ier); + + printk_cpu_sync_put_irqrestore(flags); +} + /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @ drivers/tty/serial/8250/8250_port.c:3375 @ void serial8250_console_write(struct uart_8250_port *up, const char *s, struct uart_port *port = &up->port; unsigned long flags; unsigned int ier; - int locked = 1; touch_nmi_watchdog(); - if (oops_in_progress) - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); + spin_lock_irqsave(&port->lock, flags); - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); - - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { @ drivers/tty/serial/8250/8250_port.c:3394 @ void serial8250_console_write(struct uart_8250_port *up, const char *s, mdelay(port->rs485.delay_rts_before_send); } + atomic_inc(&up->console_printing); uart_console_write(port, s, count, serial8250_console_putchar); + atomic_dec(&up->console_printing); /* * Finally, wait for transmitter to become empty @ drivers/tty/serial/8250/8250_port.c:3409 @ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (em485->tx_stopped) up->rs485_stop_tx(up); } - - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); /* * The receive handling will happen properly because the @ drivers/tty/serial/8250/8250_port.c:3421 @ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (up->msr_saved_flags) serial8250_modem_status(up); - if (locked) - spin_unlock_irqrestore(&port->lock, flags); + spin_unlock_irqrestore(&port->lock, flags); } static unsigned int probe_baud(struct uart_port *port) @ drivers/tty/serial/8250/8250_port.c:3441 @ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @ drivers/tty/serial/8250/8250_port.c:3451 @ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; + atomic_set(&up->console_printing, 0); + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) @ drivers/tty/serial/8250/Kconfig:12 @ config SERIAL_8250 depends on !S390 select SERIAL_CORE select SERIAL_MCTRL_GPIO if GPIOLIB + select HAVE_ATOMIC_CONSOLE help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. People who might say N @ drivers/tty/serial/amba-pl011.c:2339 @ pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; - unsigned long flags; + unsigned long flags = 0; int locked = 1; clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @ drivers/tty/serial/amba-pl011.c:2382 @ pl011_console_write(struct console *co, const char *s, unsigned int count) pl011_write(old_cr, uap, REG_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } @ drivers/tty/serial/omap-serial.c:1258 @ serial_omap_console_write(struct console *co, const char *s, unsigned int ier; int locked = 1; - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @ drivers/tty/serial/omap-serial.c:1288 @ serial_omap_console_write(struct console *co, const char *s, check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init @ drivers/tty/vt/vt.c:3164 @ static struct console vt_console_driver = { .write = vt_console_print, .device = vt_console_device, .unblank = unblank_screen, - .flags = CON_PRINTBUFFER, + .flags = CON_PRINTBUFFER|CON_MIGHT_SLEEP, .index = -1, }; #endif @ drivers/virt/acrn/irqfd.c:20 @ #include "acrn_drv.h" static LIST_HEAD(acrn_irqfd_clients); -static DEFINE_MUTEX(acrn_irqfds_mutex); /** * struct hsm_irqfd - Properties of HSM irqfd @ fs/afs/dir_silly.c:242 @ int afs_silly_iput(struct dentry *dentry, struct inode *inode) struct dentry *alias; int ret; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); @ fs/cifs/readdir.c:72 @ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @ fs/dcache.c:2540 @ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - + /* + * The caller has a spinlock_t (dentry::d_lock) acquired which disables + * preemption on !PREEMPT_RT. On PREEMPT_RT the lock does not disable + * preemption and it has be done explicitly. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @ fs/dcache.c:2558 @ static inline unsigned start_dir_add(struct inode *dir) static inline void end_dir_add(struct inode *dir, unsigned n) { smp_store_release(&dir->i_dir_seq, n + 2); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } static void d_wait_lookup(struct dentry *dentry) { - if (d_in_lookup(dentry)) { - DECLARE_WAITQUEUE(wait, current); - add_wait_queue(dentry->d_wait, &wait); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(&dentry->d_lock); - schedule(); - spin_lock(&dentry->d_lock); - } while (d_in_lookup(dentry)); - } + struct swait_queue __wait; + + if (!d_in_lookup(dentry)) + return; + + INIT_LIST_HEAD(&__wait.task_list); + do { + prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + finish_swait(dentry->d_wait, &__wait); } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, - wait_queue_head_t *wq) + struct swait_queue_head *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); @ fs/dcache.c:2696 @ void __d_lookup_done(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + swake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); @ fs/fscache/internal.h:84 @ extern unsigned fscache_debug; extern struct kobject *fscache_root; extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; -DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); @ fs/fscache/main.c:44 @ struct kobject *fscache_root; struct workqueue_struct *fscache_object_wq; struct workqueue_struct *fscache_op_wq; -DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); - /* these values serve as lower bounds, will be adjusted in fscache_init() */ static unsigned fscache_object_max_active = 4; static unsigned fscache_op_max_active = 2; @ fs/fscache/main.c:139 @ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) static int __init fscache_init(void) { unsigned int nr_cpus = num_possible_cpus(); - unsigned int cpu; int ret; fscache_object_max_active = @ fs/fscache/main.c:161 @ static int __init fscache_init(void) if (!fscache_op_wq) goto error_op_wq; - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); - ret = fscache_proc_init(); if (ret < 0) goto error_proc; @ fs/fscache/object.c:801 @ void fscache_object_destroy(struct fscache_object *object) } EXPORT_SYMBOL(fscache_object_destroy); +static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); + /* * enqueue an object for metadata-type processing */ @ fs/fscache/object.c:811 @ void fscache_enqueue_object(struct fscache_object *object) _enter("{OBJ%x}", object->debug_id); if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { - wait_queue_head_t *cong_wq = - &get_cpu_var(fscache_object_cong_wait); if (queue_work(fscache_object_wq, &object->work)) { if (fscache_object_congested()) - wake_up(cong_wq); + wake_up(&fscache_object_cong_wait); } else fscache_put_object(object, fscache_obj_put_queue); - - put_cpu_var(fscache_object_cong_wait); } } @ fs/fscache/object.c:834 @ void fscache_enqueue_object(struct fscache_object *object) */ bool fscache_object_sleep_till_congested(signed long *timeoutp) { - wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); DEFINE_WAIT(wait); if (fscache_object_congested()) return true; - add_wait_queue_exclusive(cong_wq, &wait); + add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); if (!fscache_object_congested()) *timeoutp = schedule_timeout(*timeoutp); - finish_wait(cong_wq, &wait); + finish_wait(&fscache_object_cong_wait, &wait); return fscache_object_congested(); } @ fs/fuse/readdir.c:161 @ static int fuse_direntplus_link(struct file *file, struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* @ fs/namei.c:1636 @ static struct dentry *__lookup_slow(const struct qstr *name, { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) @ fs/namei.c:3195 @ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); @ fs/namespace.c:346 @ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + might_lock(&mount_lock.lock); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + cpu_relax(); + } else { + /* + * This prevents priority inversion, if the task + * setting MNT_WRITE_HOLD got preempted on a remote + * CPU, and it prevents life lock if the task setting + * MNT_WRITE_HOLD has a lower priority and is bound to + * the same CPU as the task that is spinning here. + */ + preempt_enable(); + lock_mount_hash(); + unlock_mount_hash(); + preempt_disable(); + } + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until @ fs/nfs/dir.c:641 @ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; @ fs/nfs/dir.c:1863 @ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; @ fs/nfs/unlink.c:16 @ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/sched.h> -#include <linux/wait.h> +#include <linux/swait.h> #include <linux/namei.h> #include <linux/fsnotify.h> @ fs/nfs/unlink.c:183 @ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) data->cred = get_current_cred(); data->res.dir_attr = &data->dir_attr; - init_waitqueue_head(&data->wq); + init_swait_queue_head(&data->wq); status = -EBUSY; spin_lock(&dentry->d_lock); @ fs/proc/base.c:99 @ #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> +#include <linux/swait.h> #include <linux/cn_proc.h> #include <trace/events/oom.h> #include "internal.h" @ fs/proc/base.c:2049 @ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; @ fs/proc/proc_sysctl.c:681 @ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; @ include/asm-generic/softirq_stack.h:5 @ #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H #define __ASM_GENERIC_SOFTIRQ_STACK_H -#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK +#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT) void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) @ include/linux/blk-mq.h:755 @ static inline void blk_mq_set_request_complete(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } +/* + * Complete the request directly instead of deferring it to softirq or + * completing it another CPU. Useful in preemptible instead of an interrupt. + */ +static inline void blk_mq_complete_request_direct(struct request *rq, + void (*complete)(struct request *rq)) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + complete(rq); +} + void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); @ include/linux/console.h:19 @ #include <linux/atomic.h> #include <linux/types.h> +#include <linux/mutex.h> struct vc_data; struct console_font_op; @ include/linux/console.h:137 @ static inline int con_debug_leave(void) #define CON_CONSDEV (2) /* Preferred console, /dev/console */ #define CON_ENABLED (4) #define CON_BOOT (8) -#define CON_ANYTIME (16) /* Safe to call when cpu is offline */ +#define CON_ANYTIME (16) /* Safe to call before per-cpu resources ready */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ +#define CON_PAUSED (128) /* Sleep while console is locked */ +#define CON_MIGHT_SLEEP (256) /* Can only be called from sleepable context */ + +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE +struct console_atomic_data { + u64 seq; + char *text; + char *ext_text; + char *dropped_text; +}; +#endif struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); + void (*write_atomic)(struct console *, const char *, unsigned); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); @ include/linux/console.h:167 @ struct console { int cflag; uint ispeed; uint ospeed; + u64 seq; + atomic_long_t dropped; +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + struct console_atomic_data *atomic_data; +#endif + struct task_struct *thread; + + /* + * The per-console lock is used by printing kthreads to synchronize + * this console with callers of console_lock(). This is necessary in + * order to allow printing kthreads to run in parallel to each other, + * while each safely accessing their own @flags and synchronizing + * against direct printing via console_lock/console_unlock. + * + * Note: For synchronizing against direct printing via + * console_trylock/console_unlock, see the static global + * variable @console_lock_count. + */ + struct mutex lock; + void *data; struct console *next; }; @ include/linux/console.h:201 @ extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { + CONSOLE_ATOMIC_FLUSH_PENDING, CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; @ include/linux/dcache.h:111 @ struct dentry { union { struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ + struct swait_queue_head *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ @ include/linux/dcache.h:243 @ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); + struct swait_queue_head *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); @ include/linux/entry-common.h:60 @ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif +#ifdef CONFIG_PREEMPT_LAZY +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#else +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) +#endif + #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @ include/linux/interrupt.h:557 @ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +#ifdef CONFIG_PREEMPT_RT +extern void raise_timer_softirq(void); +extern void raise_hrtimer_softirq(void); + +#else +static inline void raise_timer_softirq(void) +{ + raise_softirq(TIMER_SOFTIRQ); +} + +static inline void raise_hrtimer_softirq(void) +{ + raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} +#endif + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) @ include/linux/irqdesc.h:163 @ static inline void generic_handle_irq_desc(struct irq_desc *desc) int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); +int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* @ include/linux/irqflags.h:74 @ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @ include/linux/irqflags.h:135 @ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); @ include/linux/local_lock_internal.h:47 @ static inline void local_lock_debug_init(local_lock_t *l) } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) -static inline void local_lock_acquire(local_lock_t *l) { } -static inline void local_lock_release(local_lock_t *l) { } -static inline void local_lock_debug_init(local_lock_t *l) { } +# define local_lock_acquire(__ll) do { typecheck(local_lock_t *, __ll); } while (0) +# define local_lock_release(__ll) do { typecheck(local_lock_t *, __ll); } while (0) +# define local_lock_debug_init(__ll) do { typecheck(local_lock_t *, __ll); } while (0) #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } @ include/linux/netdevice.h:4006 @ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); -int netif_rx_ni(struct sk_buff *skb); -int netif_rx_any_context(struct sk_buff *skb); + +static inline int netif_rx_ni(struct sk_buff *skb) +{ + return netif_rx(skb); +} + +static inline int netif_rx_any_context(struct sk_buff *skb) +{ + return netif_rx(skb); +} + int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list(struct list_head *head); @ include/linux/nfs_xdr.h:1687 @ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; + struct swait_queue_head wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; @ include/linux/preempt.h:199 @ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @ include/linux/preempt.h:221 @ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() barrier(); +#else +# define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() +#endif #define preemptible() (preempt_count() == 0 && !irqs_disabled()) @ include/linux/preempt.h:264 @ do { \ __preempt_schedule(); \ } while (0) +/* + * open code preempt_check_resched() because it is not exported to modules and + * used by local_unlock() or bpf_enable_instrumentation(). + */ +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + if (should_resched(0)) \ + __preempt_schedule(); \ +} while (0) + #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ @ include/linux/preempt.h:283 @ do { \ preempt_count_dec(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define preempt_enable_notrace() \ do { \ barrier(); \ @ include/linux/preempt.h:327 @ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #define preemptible() 0 +#define preempt_lazy_disable() barrier() +#define preempt_lazy_enable() barrier() + #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE @ include/linux/preempt.h:351 @ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) @ include/linux/preempt.h:467 @ extern void migrate_enable(void); #else -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } +static inline void migrate_disable(void) +{ + preempt_lazy_disable(); +} + +static inline void migrate_enable(void) +{ + preempt_lazy_enable(); +} #endif /* CONFIG_SMP */ @ include/linux/printk.h:173 @ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit +extern bool pr_flush(int timeout_ms, bool reset_on_progress); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @ include/linux/printk.h:229 @ static inline void printk_deferred_exit(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + static inline int printk_ratelimit(void) { return 0; @ include/linux/printk.h:291 @ static inline void printk_trigger_flush(void) #endif #ifdef CONFIG_SMP -extern int __printk_cpu_trylock(void); -extern void __printk_wait_on_cpu_lock(void); -extern void __printk_cpu_unlock(void); +extern int __printk_cpu_sync_try_get(void); +extern void __printk_cpu_sync_wait(void); +extern void __printk_cpu_sync_put(void); + +#else + +#define __printk_cpu_sync_try_get() true +#define __printk_cpu_sync_wait() +#define __printk_cpu_sync_put() +#endif /* CONFIG_SMP */ /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk + * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). + * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. */ -#define printk_cpu_lock_irqsave(flags) \ - for (;;) { \ - local_irq_save(flags); \ - if (__printk_cpu_trylock()) \ - break; \ - local_irq_restore(flags); \ - __printk_wait_on_cpu_lock(); \ +#define printk_cpu_sync_get_irqsave(flags) \ + for (;;) { \ + local_irq_save(flags); \ + if (__printk_cpu_sync_try_get()) \ + break; \ + local_irq_restore(flags); \ + __printk_cpu_sync_wait(); \ } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). + * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning + * lock and restore interrupts. + * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ +#define printk_cpu_sync_put_irqrestore(flags) \ do { \ - __printk_cpu_unlock(); \ + __printk_cpu_sync_put(); \ local_irq_restore(flags); \ - } while (0) \ - -#else - -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) - -#endif /* CONFIG_SMP */ + } while (0) extern int kptr_restrict; @ include/linux/random.h:38 @ static inline void add_latent_entropy(void) {} extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; +extern void add_interrupt_randomness(int irq) __latent_entropy; +extern void process_interrupt_randomness(void); extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); @ include/linux/ratelimit_types.h:7 @ #include <linux/bits.h> #include <linux/param.h> -#include <linux/spinlock_types.h> +#include <linux/spinlock_types_raw.h> #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 @ include/linux/rcupdate.h:98 @ void rcu_init_tasks_generic(void); static inline void rcu_init_tasks_generic(void) { } #endif +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TASKS_RCU_GENERIC) +void rcu_tasks_initiate_self_tests(void); +#else +static inline void rcu_tasks_initiate_self_tests(void) {} +#endif + + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); @ include/linux/rtmutex.h:102 @ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#define rt_mutex_lock_nest_lock(lock, nest_lock) \ + do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + } while (0) + #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); @ include/linux/rwlock.h:58 @ do { \ #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) +#else +#define write_lock_nested(lock, subclass) _raw_write_lock(lock) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define read_lock_irqsave(lock, flags) \ @ include/linux/rwlock_api_smp.h:20 @ void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); @ include/linux/rwlock_api_smp.h:213 @ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } +static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); +} + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) @ include/linux/rwlock_rt.h:31 @ extern void rt_read_lock(rwlock_t *rwlock); extern int rt_read_trylock(rwlock_t *rwlock); extern void rt_read_unlock(rwlock_t *rwlock); extern void rt_write_lock(rwlock_t *rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); extern int rt_write_trylock(rwlock_t *rwlock); extern void rt_write_unlock(rwlock_t *rwlock); @ include/linux/rwlock_rt.h:87 @ static __always_inline void write_lock(rwlock_t *rwlock) rt_write_lock(rwlock); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rt_write_lock_nested(rwlock, subclass); +} +#else +#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) +#endif + static __always_inline void write_lock_bh(rwlock_t *rwlock) { local_bh_disable(); @ include/linux/sched.h:121 @ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - #define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) - /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). @ include/linux/sched.h:1081 @ struct task_struct { /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT + /* TODO: move me into ->restart_block ? */ + struct kernel_siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; @ include/linux/sched.h:1730 @ static __always_inline bool is_percpu_thread(void) #endif } +/* Is the current task guaranteed to stay on its current CPU? */ +static inline bool is_migratable(void) +{ +#ifdef CONFIG_SMP + return preemptible() && !current->migration_disabled; +#else + return false; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ @ include/linux/sched.h:2012 @ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + +#ifdef CONFIG_PREEMPT_RT +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return p->saved_state == match_state; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + bool traced = false; + + /* in case the task is sleeping on tasklist_lock */ + raw_spin_lock_irq(&task->pi_lock); + if (READ_ONCE(task->__state) & __TASK_TRACED) + traced = true; + else if (task->saved_state & __TASK_TRACED) + traced = true; + raw_spin_unlock_irq(&task->pi_lock); + return traced; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + bool traced_stopped = false; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return traced_stopped; +} + +#else + +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return false; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & __TASK_TRACED; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED); +} +#endif + +static inline bool task_match_state_or_saved(struct task_struct *p, + long match_state) +{ + if (READ_ONCE(p->__state) == match_state) + return true; + + return task_match_saved_state(p, match_state); +} + +static inline bool task_match_state_lock(struct task_struct *p, + long match_state) +{ + bool match; + + raw_spin_lock_irq(&p->pi_lock); + match = task_match_state_or_saved(p, match_state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @ include/linux/sched/task_stack.h:73 @ static inline void *try_get_task_stack(struct task_struct *tsk) } extern void put_task_stack(struct task_struct *tsk); +extern void put_task_stack_sched(struct task_struct *tsk); #else static inline void *try_get_task_stack(struct task_struct *tsk) { @ include/linux/sched/task_stack.h:81 @ static inline void *try_get_task_stack(struct task_struct *tsk) } static inline void put_task_stack(struct task_struct *tsk) {} +static inline void put_task_stack_sched(struct task_struct *tsk) {} #endif +#ifdef CONFIG_ARCH_THREAD_STACK_ALLOCATOR +static inline void task_stack_cleanup(struct task_struct *tsk) {} +#else +extern void task_stack_cleanup(struct task_struct *tsk); +#endif + +void exit_task_stack_account(struct task_struct *tsk); + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) @ include/linux/serial_8250.h:10 @ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H +#include <linux/atomic.h> #include <linux/serial_core.h> #include <linux/serial_reg.h> #include <linux/platform_device.h> @ include/linux/serial_8250.h:129 @ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + atomic_t console_printing; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @ include/linux/serial_8250.h:186 @ void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); +void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); @ include/linux/smp.h:270 @ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() +#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) +#define put_cpu_light() migrate_enable() + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: @ include/linux/spinlock_api_up.h:62 @ #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) +#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) #define _raw_read_lock_bh(lock) __LOCK_BH(lock) #define _raw_write_lock_bh(lock) __LOCK_BH(lock) @ include/linux/spinlock_types_up.h:4 @ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif @ include/linux/thread_info.h:166 @ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) +#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_lazy() 0 +#endif #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, @ include/linux/trace_events.h:72 @ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned char preempt_lazy_count; }; #define TRACE_EVENT_TYPE_MAX \ @ include/linux/trace_events.h:161 @ static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; + entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; entry->pid = current->pid; entry->type = type; - entry->flags = trace_ctx >> 16; + entry->flags = trace_ctx >> 24; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); @ include/linux/trace_events.h:177 @ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @ include/linux/u64_stats_sync.h:69 @ #include <linux/seqlock.h> struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @ include/linux/u64_stats_sync.h:128 @ static inline void u64_stats_inc(u64_stats_t *p) } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @ include/linux/u64_stats_sync.h:138 @ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @ include/linux/u64_stats_sync.h:159 @ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @ include/linux/u64_stats_sync.h:173 @ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @ include/linux/u64_stats_sync.h:193 @ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @ include/linux/u64_stats_sync.h:202 @ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @ include/linux/u64_stats_sync.h:212 @ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @ include/linux/u64_stats_sync.h:226 @ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @ include/linux/u64_stats_sync.h:237 @ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @ include/trace/events/net.h:263 @ DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_ARGS(skb) ); -DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, - - TP_PROTO(const struct sk_buff *skb), - - TP_ARGS(skb) -); - DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), @ include/trace/events/net.h:308 @ DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_ARGS(ret) ); -DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, - - TP_PROTO(int ret), - - TP_ARGS(ret) -); - DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), @ init/Kconfig:1545 @ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config HAVE_ATOMIC_CONSOLE + bool + default n + config BUG bool "BUG() support" if EXPERT default y @ init/main.c:1600 @ static noinline void __init kernel_init_freeable(void) rcu_init_tasks_generic(); do_pre_smp_initcalls(); + rcu_tasks_initiate_self_tests(); lockup_detector_init(); smp_init(); @ kernel/Kconfig.preempt:4 @ # SPDX-License-Identifier: GPL-2.0-only +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + config PREEMPT_NONE_BUILD bool @ kernel/cgroup/rstat.c:159 @ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @ kernel/cgroup/rstat.c:173 @ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || @ kernel/entry/common.c:162 @ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & _TIF_NEED_RESCHED_MASK) schedule(); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); @ kernel/entry/common.c:398 @ void irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (should_resched(0)) preempt_schedule_irq(); } } @ kernel/exit.c:174 @ static void delayed_put_task_struct(struct rcu_head *rhp) kprobe_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); + task_stack_cleanup(tsk); put_task_struct(tsk); } @ kernel/exit.c:875 @ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @ kernel/fork.c:181 @ static inline void free_task_struct(struct task_struct *tsk) #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR +#define THREAD_STACK_DELAYED_FREE 1UL + +static void thread_stack_mark_delayed_free(struct task_struct *tsk) +{ + unsigned long val = (unsigned long)tsk->stack; + + val |= THREAD_STACK_DELAYED_FREE; + WRITE_ONCE(tsk->stack, (void *)val); +} + /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -#ifdef CONFIG_VMAP_STACK +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @ kernel/fork.c:222 @ static int free_vm_stack_cache(unsigned int cpu) return 0; } -#endif -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { -#ifdef CONFIG_VMAP_STACK + int i; + int ret; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + if (ret) + goto err; + } + return 0; +err: + /* + * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is + * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will + * ignore this page. + */ + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + return ret; +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + struct vm_struct *vm; void *stack; int i; @ kernel/fork.c:268 @ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); + if (memcg_charge_kernel_stack(s)) { + vfree(s->addr); + return -ENOMEM; + } + tsk->stack_vm_area = s; tsk->stack = s->addr; - return s->addr; + return 0; } /* @ kernel/fork.c:288 @ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + if (!stack) + return -ENOMEM; + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { + vfree(stack); + return -ENOMEM; + } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) { - tsk->stack_vm_area = find_vm_area(stack); - tsk->stack = stack; + tsk->stack_vm_area = vm; + tsk->stack = stack; + return 0; +} + +static void free_thread_stack(struct task_struct *tsk, bool cache_only) +{ + int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, + tsk->stack_vm_area) != NULL) + continue; + + tsk->stack = NULL; + tsk->stack_vm_area = NULL; + return; } - return stack; -#else + if (cache_only) { + thread_stack_mark_delayed_free(tsk); + return; + } + + vfree(tsk->stack); + tsk->stack = NULL; + tsk->stack_vm_area = NULL; +} + +# else /* !CONFIG_VMAP_STACK */ + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; + return 0; } - return NULL; -#endif + return -ENOMEM; } -static inline void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk, bool cache_only) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { - int i; - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); - - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], - NULL, tsk->stack_vm_area) != NULL) - continue; - - return; - } - - vfree_atomic(tsk->stack); + if (cache_only) { + thread_stack_mark_delayed_free(tsk); return; } -#endif - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + tsk->stack = NULL; } -# else + +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ + static struct kmem_cache *thread_stack_cache; -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; - return stack; + return stack ? 0 : -ENOMEM; } -static void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk, bool cache_only) { + if (cache_only) { + thread_stack_mark_delayed_free(tsk); + return; + } kmem_cache_free(thread_stack_cache, tsk->stack); + tsk->stack = NULL; } void thread_stack_cache_init(void) @ kernel/fork.c:384 @ void thread_stack_cache_init(void) THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } -# endif -#endif + +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ + +void task_stack_cleanup(struct task_struct *tsk) +{ + unsigned long val = (unsigned long)tsk->stack; + + if (!(val & THREAD_STACK_DELAYED_FREE)) + return; + + WRITE_ONCE(tsk->stack, (void *)(val & ~THREAD_STACK_DELAYED_FREE)); + free_thread_stack(tsk, false); +} + +#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + unsigned long *stack; + + stack = arch_alloc_thread_stack_node(tsk, node); + tsk->stack = stack; + return stack ? 0 : -ENOMEM; +} + +static void free_thread_stack(struct task_struct *tsk, bool cache_only) +{ + arch_free_thread_stack(tsk); +} + +#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @ kernel/fork.c:468 @ void vm_area_free(struct vm_area_struct *vma) static void account_kernel_stack(struct task_struct *tsk, int account) { - void *stack = task_stack_page(tsk); - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { + void *stack = task_stack_page(tsk); + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } -static int memcg_charge_kernel_stack(struct task_struct *tsk) +void exit_task_stack_account(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - int ret; + account_kernel_stack(tsk, -1); - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm; int i; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - /* - * If memcg_kmem_charge_page() fails, page's - * memory cgroup pointer is NULL, and - * memcg_kmem_uncharge_page() in free_thread_stack() - * will ignore this page. - */ - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, - 0); - if (ret) - return ret; - } + vm = task_stack_vm_area(tsk); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); } -#endif - return 0; } -static void release_task_stack(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk, bool cache_only) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ - account_kernel_stack(tsk, -1); - free_thread_stack(tsk); - tsk->stack = NULL; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = NULL; -#endif + free_thread_stack(tsk, cache_only); } #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { if (refcount_dec_and_test(&tsk->stack_refcount)) - release_task_stack(tsk); + release_task_stack(tsk, false); +} + +void put_task_stack_sched(struct task_struct *tsk) +{ + if (refcount_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk, true); } #endif @ kernel/fork.c:530 @ void free_task(struct task_struct *tsk) * The task is finally done with both the stack and thread_info, * so free both. */ - release_task_stack(tsk); + release_task_stack(tsk, false); #else /* * If the task had a separate stack allocation, it should be gone @ kernel/fork.c:950 @ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - unsigned long *stack; - struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) @ kernel/fork.c:958 @ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - stack = alloc_thread_stack_node(tsk, node); - if (!stack) + err = arch_dup_task_struct(tsk, orig); + if (err) goto free_tsk; - if (memcg_charge_kernel_stack(tsk)) - goto free_stack; + err = alloc_thread_stack_node(tsk, node); + if (err) + goto free_tsk; - stack_vm_area = task_stack_vm_area(tsk); - - err = arch_dup_task_struct(tsk, orig); - - /* - * arch_dup_task_struct() clobbers the stack-related fields. Make - * sure they're properly initialized before using any stack-related - * functions again. - */ - tsk->stack = stack; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; -#endif #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif - - if (err) - goto free_stack; + account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) @ kernel/fork.c:1013 @ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->wake_q.next = NULL; tsk->pf_io_worker = NULL; - account_kernel_stack(tsk, 1); - kcov_task_init(tsk); kmap_local_fork(tsk); @ kernel/fork.c:1031 @ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) return tsk; free_stack: - free_thread_stack(tsk); + exit_task_stack_account(tsk); + free_thread_stack(tsk, false); free_tsk: free_task_struct(tsk); return NULL; @ kernel/fork.c:2528 @ static __latent_entropy struct task_struct *copy_process( exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); + exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: @ kernel/irq/chip.c:578 @ EXPORT_SYMBOL_GPL(handle_simple_irq); */ void handle_untracked_irq(struct irq_desc *desc) { - unsigned int flags = 0; - raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @ kernel/irq/chip.c:594 @ void handle_untracked_irq(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc, &flags); + __handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); @ kernel/irq/handle.c:139 @ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int irq = desc->irq_data.irq; @ kernel/irq/handle.c:177 @ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags } __irq_wake_thread(desc, action); - - fallthrough; /* to add to randomness */ - case IRQ_HANDLED: - *flags |= action->flags; break; default: @ kernel/irq/handle.c:192 @ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; - unsigned int flags = 0; - retval = __handle_irq_event_percpu(desc, &flags); + retval = __handle_irq_event_percpu(desc); - add_interrupt_randomness(desc->irq_data.irq, flags); + add_interrupt_randomness(desc->irq_data.irq); if (!irq_settings_no_debug(desc)) note_interrupt(desc, retval); @ kernel/irq/internals.h:106 @ extern int __irq_get_irqchip_state(struct irq_data *data, extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @ kernel/irq/irqdesc.c:665 @ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq + * @irq: The irq number to handle + * + * Returns: 0 on success, or -EINVAL if conversion has failed + * + * This function must be called either from an IRQ context with irq regs + * initialized or with care from any context. + */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging @ kernel/irq/manage.c:1284 @ static int irq_thread(void *data) if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + process_interrupt_randomness(); + wake_threads_waitq(desc); } @ kernel/ksysfs.c:141 @ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @ kernel/ksysfs.c:240 @ static struct attribute * kernel_attrs[] = { #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, #endif NULL }; @ kernel/locking/lockdep.c:5488 @ static noinstr void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @ kernel/locking/lockdep.c:5503 @ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); @ kernel/locking/rtmutex.c:1106 @ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; raw_spin_lock(&task->pi_lock); @ kernel/locking/rtmutex_api.c:24 @ int max_lock_depth = 1024; */ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, unsigned int state, + struct lockdep_map *nest_lock, unsigned int subclass) { int ret; might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); ret = __rt_mutex_lock(&lock->rtmutex, state); if (ret) mutex_release(&lock->dep_map, _RET_IP_); @ kernel/locking/rtmutex_api.c:52 @ EXPORT_SYMBOL(rt_mutex_base_init); */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ /** @ kernel/locking/rtmutex_api.c:71 @ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @ kernel/locking/rtmutex_api.c:87 @ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + /** * rt_mutex_trylock - try to lock a rt_mutex * @ kernel/locking/spinlock.c:303 @ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE @ kernel/locking/spinlock_rt.c:242 @ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); @ kernel/locking/spinlock_rt.c:272 @ void __sched rt_write_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_unlock); -int __sched rt_rwlock_is_contended(rwlock_t *rwlock) -{ - return rw_base_is_contended(&rwlock->rwbase); -} -EXPORT_SYMBOL(rt_rwlock_is_contended); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) @ kernel/locking/ww_rt_mutex.c:29 @ int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) if (__rt_mutex_trylock(&rtm->rtmutex)) { ww_mutex_set_context_fastpath(lock, ww_ctx); - mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); return 1; } @ kernel/panic.c:218 @ void panic(const char *fmt, ...) panic_smp_self_stop(); console_verbose(); - bust_spinlocks(1); va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); @ kernel/panic.c:241 @ void panic(const char *fmt, ...) */ kgdb_panic(buf); + /* Use atomic consoles to dump the kernel log. */ + console_flush_on_panic(CONSOLE_ATOMIC_FLUSH_PENDING); + + bust_spinlocks(1); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @ kernel/panic.c:540 @ void oops_enter(void) trigger_all_cpu_backtrace(); } -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - static void print_oops_end_marker(void) { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* @ kernel/printk/printk.c:47 @ #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> +#include <linux/clocksource.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> @ kernel/printk/printk.c:218 @ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; +/* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * + * Values: + * -1 = console locked (via trylock), kthreads will not print + * 0 = no kthread printing, console not locked (via trylock) + * >0 = kthread(s) actively printing + * + * Note: For synchronizing against direct printing via + * console_lock/console_unlock, see the @lock variable in + * struct console. + */ +static atomic_t console_lock_count = ATOMIC_INIT(0); + +#define console_excl_trylock() (atomic_cmpxchg(&console_lock_count, 0, -1) == 0) +#define console_excl_unlock() atomic_cmpxchg(&console_lock_count, -1, 0) +#define console_printer_tryenter() atomic_inc_unless_negative(&console_lock_count) +#define console_printer_exit() atomic_dec(&console_lock_count) + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @ kernel/printk/printk.c:281 @ static void __up_console_sem(unsigned long ip) #define up_console_sem() __up_console_sem(_RET_IP_) /* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it and are racing, but it helps tracking those weird code - * paths in the console code where we end up in places I want - * locked without the console semaphore held). + * Tracks whether kthread printers are all paused. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Reading and writing to this variable requires holding @console_sem. */ -static int console_locked, console_suspended; +static bool consoles_paused; /* - * If exclusive_console is non-NULL then only this console is to be printed to. + * Pause or unpause all kthread printers. + * + * Requires the console_lock. */ -static struct console *exclusive_console; +static void __pause_all_consoles(bool do_pause) +{ + struct console *con; + + for_each_console(con) { + mutex_lock(&con->lock); + if (do_pause) + con->flags |= CON_PAUSED; + else + con->flags &= ~CON_PAUSED; + mutex_unlock(&con->lock); + } + + consoles_paused = do_pause; +} + +#define pause_all_consoles() __pause_all_consoles(true) +#define unpause_all_consoles() __pause_all_consoles(false) + +static int console_suspended; /* * Array of consoles built from command line options (console=) @ kernel/printk/printk.c:395 @ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * A flag to signify if printk_late_init() has already started the kthread + * printers. If true, any later registered consoles must start their own + * kthread directly. The flag is write protected by the console_lock. + */ +static bool kthreads_started; + +static inline bool kthread_printers_active(void) +{ + return (kthreads_started && + system_state == SYSTEM_RUNNING && + !oops_in_progress); +} + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ @ kernel/printk/printk.c:417 @ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - struct latched_seq { seqcount_latch_t latch; u64 val[2]; @ kernel/printk/printk.c:442 @ static struct latched_seq clear_seq = { /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 +/* the maximum size for a dropped text message */ +#define DROPPED_TEXT_MAX 64 + /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @ kernel/printk/printk.c:1876 @ static int console_lock_spinning_disable_and_check(void) return 1; } +#if (!IS_ENABLED(CONFIG_PREEMPT_RT)) /** * console_trylock_spinning - try to get console_lock by busy waiting * @ kernel/printk/printk.c:1940 @ static int console_trylock_spinning(void) return 1; } +#endif /* CONFIG_PREEMPT_RT */ /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. + * Call the specified console driver, asking it to write out the specified + * text and length. If @dropped_text is non-NULL and any records have been + * dropped, a dropped message will be written out first. */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text, bool atomic_printing) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + unsigned long dropped = 0; + size_t dropped_len; trace_console_rcuidle(text, len); - if (!console_drivers) - return; + if (dropped_text) + dropped = atomic_long_xchg_relaxed(&con->dropped, 0); - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), + if (dropped) { + dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; + dropped); + if (atomic_printing) + con->write_atomic(con, dropped_text, dropped_len); + else + con->write(con, dropped_text, dropped_len); } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } - } + if (atomic_printing) + con->write_atomic(con, text, len); + else + con->write(con, text, len); } /* @ kernel/printk/printk.c:2061 @ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + raw_smp_processor_id(); + 0x80000000 + smp_processor_id(); } /** @ kernel/printk/printk.c:2143 @ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; @ kernel/printk/printk.c:2152 @ int vprintk_store(int facility, int level, u8 *recursion_ptr; u16 reserve_size; va_list args2; + u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is @ kernel/printk/printk.c:2168 @ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); - if (!printk_enter_irqsave(recursion_ptr, irqflags)) - return 0; + caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be @ kernel/printk/printk.c:2268 @ asmlinkage int vprintk_emit(int facility, int level, in_sched = true; } - boot_delay_msec(level); - printk_delay(); - printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && !kthread_printers_active()) { + /* + * Try to acquire and then immediately release the console + * semaphore. The release will print out buffers. + */ +#if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* + * Use the non-spinning trylock since PREEMPT_RT does not + * support console lock handovers. + * + * Direct printing will most likely involve taking spinlocks. + * For PREEMPT_RT, this is only allowed if in a preemptible + * context. + */ + if (preemptible() && console_trylock()) + console_unlock(); +#else /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to * console */ preempt_disable(); - /* - * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. - */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); +#endif } wake_up_klogd(); @ kernel/printk/printk.c:2324 @ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE +static void __free_atomic_data(struct console_atomic_data *d) +{ + kfree(d->text); + kfree(d->ext_text); + kfree(d->dropped_text); +} + +static void free_atomic_data(struct console_atomic_data *d) +{ + int count = 1; + int i; + + if (!d) + return; + +#ifdef CONFIG_HAVE_NMI + count = 2; +#endif + + for (i = 0; i < count; i++) + __free_atomic_data(&d[i]); + kfree(d); +} + +static int __alloc_atomic_data(struct console_atomic_data *d, short flags) +{ + d->text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!d->text) + return -1; + + if (flags & CON_EXTENDED) { + d->ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!d->ext_text) + return -1; + } else { + d->dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); + if (!d->dropped_text) + return -1; + } + + return 0; +} + +static struct console_atomic_data *alloc_atomic_data(short flags) +{ + struct console_atomic_data *d; + int count = 1; + int i; + +#ifdef CONFIG_HAVE_NMI + count = 2; +#endif + + d = kzalloc(sizeof(*d) * count, GFP_KERNEL); + if (!d) + goto err_out; + + for (i = 0; i < count; i++) { + if (__alloc_atomic_data(&d[i], flags) != 0) + goto err_out; + } + + return d; +err_out: + free_atomic_data(d); + return NULL; +} +#endif /* CONFIG_HAVE_ATOMIC_CONSOLE */ + +static void start_printk_kthread(struct console *con); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 +#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 + +#define free_atomic_data(d) static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) @ kernel/printk/printk.c:2425 @ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text, bool atomic_printing) {} static bool suppress_message_printing(int level) { return false; } +static void start_printk_kthread(struct console *con) {} #endif /* CONFIG_PRINTK */ @ kernel/printk/printk.c:2604 @ void suspend_console(void) if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_flush(1000, true); console_lock(); console_suspended = 1; up_console_sem(); @ kernel/printk/printk.c:2617 @ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + pr_flush(1000, true); } /** @ kernel/printk/printk.c:2654 @ void console_lock(void) down_console_sem(); if (console_suspended) return; - console_locked = 1; + pause_all_consoles(); console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); @ kernel/printk/printk.c:2675 @ int console_trylock(void) up_console_sem(); return 0; } - console_locked = 1; + if (!console_excl_trylock()) { + up_console_sem(); + return 0; + } console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); +/* + * This is used to help to make sure that certain paths within the VT code are + * running with the console lock held. It is definitely not the perfect debug + * tool (it is not known if the VT code is the task holding the console lock), + * but it helps tracking those weird code paths in the console code such as + * when the console is suspended: where the console is not locked but no + * console printing may occur. + * + * Note: This returns true when the console is suspended but is not locked. + * This is intentional because the VT code must consider that situation + * the same as if the console was locked. + */ int is_console_locked(void) { - return console_locked; + return (consoles_paused || atomic_read(&console_lock_count)); } EXPORT_SYMBOL(is_console_locked); /* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. */ -static int have_callable_console(void) +static inline bool console_is_usable(struct console *con, bool atomic_printing) { - struct console *con; + if (!(con->flags & CON_ENABLED)) + return false; - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; + if (atomic_printing) { +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + if (!con->write_atomic) + return false; + if (!con->atomic_data) + return false; +#else + return false; +#endif + } else if (!con->write) { + return false; + } - return 0; + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until per-cpu resources have + * been allocated. + */ + if (!printk_percpu_data_ready() && + !(con->flags & CON_ANYTIME)) + return false; + + return true; +} + +static void __console_unlock(void) +{ + /* + * Depending on whether console_lock() or console_trylock() was used, + * appropriately allow the kthread printers to continue. + */ + if (consoles_paused) + unpause_all_consoles(); + else + console_excl_unlock(); + + /* Wake the kthread printers. */ + wake_up_klogd(); + + up_console_sem(); +} + +static u64 read_console_seq(struct console *con) +{ +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + unsigned long flags; + u64 seq2; + u64 seq; + + if (!con->atomic_data) + return con->seq; + + printk_cpu_sync_get_irqsave(flags); + + seq = con->seq; + seq2 = con->atomic_data[0].seq; + if (seq2 > seq) + seq = seq2; +#ifdef CONFIG_HAVE_NMI + seq2 = con->atomic_data[1].seq; + if (seq2 > seq) + seq = seq2; +#endif + + printk_cpu_sync_put_irqrestore(flags); + + return seq; +#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ + return con->seq; +#endif +} + +static void write_console_seq(struct console *con, u64 val, bool atomic_printing) +{ +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + unsigned long flags; + u64 *seq; + + if (!con->atomic_data) { + con->seq = val; + return; + } + + printk_cpu_sync_get_irqsave(flags); + + if (atomic_printing) { + seq = &con->atomic_data[0].seq; +#ifdef CONFIG_HAVE_NMI + if (in_nmi()) + seq = &con->atomic_data[1].seq; +#endif + } else { + seq = &con->seq; + } + *seq = val; + + printk_cpu_sync_put_irqrestore(flags); +#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ + con->seq = val; +#endif } /* - * Can we actually use the console at this time on this cpu? + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * @text is a buffer of size CONSOLE_LOG_MAX. + * + * If extended messages should be printed, @ext_text is a buffer of size + * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. + * + * If dropped messages should be printed, @dropped_text is a buffer of size + * DROPPED_TEXT_MAX. Otherise @dropped_text must be NULL. + * + * @atomic_printing specifies if atomic printing should be used. + * + * Requires the console_lock. + * + * Returns false if the given console has no next record to print, otherwise + * true. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. A NULL pointer may be provided to disable allowing + * the console_lock to be taken over by a printk waiter. */ -static inline int can_use_console(void) +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool atomic_printing, + bool *handover) { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); + struct printk_info info; + struct printk_record r; + unsigned long flags; + bool allow_handover; + char *write_text; + size_t len; + u64 seq; + + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + if (handover) + *handover = false; + + seq = read_console_seq(con); + + if (!prb_read_valid(prb, seq, &r)) + return false; + + if (seq != r.info->seq) { + atomic_long_add((unsigned long)(r.info->seq - seq), &con->dropped); + write_console_seq(con, r.info->seq, atomic_printing); + seq = r.info->seq; + } + + /* Skip record that has level above the console loglevel. */ + if (suppress_message_printing(r.info->level)) { + write_console_seq(con, seq + 1, atomic_printing); + goto skip; + } + + if (ext_text) { + write_text = ext_text; + len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); + len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + write_text = text; + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } + +#if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* PREEMPT_RT does not support console lock handovers. */ + allow_handover = false; +#else + /* Handovers may only happen between trylock contexts. */ + allow_handover = (handover && atomic_read(&console_lock_count) == -1); +#endif + + if (allow_handover) { + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + } + + stop_critical_timings(); /* don't trace print latency */ + call_console_driver(con, write_text, len, dropped_text, atomic_printing); + start_critical_timings(); + + write_console_seq(con, seq + 1, atomic_printing); + + if (allow_handover) { + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); + } + + boot_delay_msec(r.info->level); + printk_delay(); +skip: + return true; } +/* + * Print out all remaining records to all consoles. + * + * Requires the console_lock. + * + * Returns true if a console was available for flushing, otherwise false. + * + * @next_seq is set to the highest sequence number of all of the consoles that + * were flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + static char dropped_text[DROPPED_TEXT_MAX]; + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; + bool any_usable = false; + struct console *con; + bool any_progress; + + *next_seq = 0; + *handover = false; + + do { + /* Let the kthread printers do the work if they can. */ + if (kthread_printers_active()) + return false; + + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con, false)) + continue; + if ((con->flags & CON_MIGHT_SLEEP) && !do_cond_resched) + continue; + any_usable = true; + + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + false, handover); + } else { + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + false, handover); + } + if (*handover) + return true; + + /* Track the highest seq flushed. */ + if (con->seq > *next_seq) + *next_seq = con->seq; + + if (!progress) + continue; + any_progress = true; + + if (do_cond_resched) + cond_resched(); + } + } while (any_progress); + + return any_usable; +} + +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE +static void atomic_console_flush_all(void) +{ + bool any_usable = false; + unsigned long flags; + struct console *con; + bool any_progress; + int index = 0; + + if (console_suspended) + return; + +#ifdef CONFIG_HAVE_NMI + if (in_nmi()) + index = 1; +#endif + + printk_cpu_sync_get_irqsave(flags); + + do { + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con, true)) + continue; + any_usable = true; + + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record(con, + &con->atomic_data->text[index], + &con->atomic_data->ext_text[index], + NULL, + true, NULL); + } else { + progress = console_emit_next_record(con, + &con->atomic_data->text[index], + NULL, + &con->atomic_data->dropped_text[index], + true, NULL); + } + + if (!progress) + continue; + any_progress = true; + + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); + } + } while (any_progress); + + printk_cpu_sync_put_irqrestore(flags); +} +#else /* CONFIG_HAVE_ATOMIC_CONSOLE */ +#define atomic_console_flush_all() +#endif + /** * console_unlock - unlock the console system * @ kernel/printk/printk.c:3072 @ static inline int can_use_console(void) */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; + bool do_cond_resched; + bool handover; + bool flushed; + u64 next_seq; if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @ kernel/printk/printk.c:3097 @ void console_unlock(void) * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } + do { + console_may_schedule = 0; - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) + flushed = console_flush_all(do_cond_resched, &next_seq, &handover); + if (handover) break; - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - } + __console_unlock(); - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } + /* Were there any consoles available for flushing? */ + if (!flushed) + break; /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. + * Some context may have added new records after + * console_flush_all() but before unlocking the console. + * Re-check if there is a new record to flush. If the trylock + * fails, another context is already handling the printing. */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; - - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - - console_locked = 0; - up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) - goto again; + } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); @ kernel/printk/printk.c:3148 @ void console_unblank(void) if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; - } else + if (!console_excl_trylock()) { + up_console_sem(); + return; + } + } else { + pr_flush(1000, true); console_lock(); + } - console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) @ kernel/printk/printk.c:3172 @ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + if (mode == CONSOLE_ATOMIC_FLUSH_PENDING) { + atomic_console_flush_all(); + return; + } + /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so @ kernel/printk/printk.c:3187 @ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + write_console_seq(c, seq, false); + } console_unlock(); } @ kernel/printk/printk.c:3225 @ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { + pr_flush(1000, true); console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); @ kernel/printk/printk.c:3237 @ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + pr_flush(1000, true); } EXPORT_SYMBOL(console_start); @ kernel/printk/printk.c:3418 @ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; + if (consoles_paused) + newcon->flags |= CON_PAUSED; + atomic_long_set(&newcon->dropped, 0); +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + newcon->atomic_data = NULL; +#endif + mutex_init(&newcon->lock); + if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - console_seq = syslog_seq; + write_console_seq(newcon, syslog_seq, false); mutex_unlock(&syslog_lock); + } else { + /* Begin with next message. */ + write_console_seq(newcon, prb_next_seq(prb), false); } + if (kthreads_started) + start_printk_kthread(newcon); console_unlock(); console_sysfs_notify(); @ kernel/printk/printk.c:3493 @ int unregister_console(struct console *console) } } + if (console->thread) { + kthread_stop(console->thread); + console->thread = NULL; + } + if (res) goto out_disable_unlock; @ kernel/printk/printk.c:3515 @ int unregister_console(struct console *console) console_unlock(); console_sysfs_notify(); +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + free_atomic_data(console->atomic_data); +#endif + if (console->exit) res = console->exit(console); @ kernel/printk/printk.c:3607 @ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + + console_lock(); + for_each_console(con) + start_printk_kthread(con); + kthreads_started = true; + console_unlock(); + return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *con; + u64 last_diff = 0; + u64 printk_seq; + u64 diff; + u64 seq; + + might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + console_lock(); + for_each_console(con) { + if (!console_is_usable(con, false)) + continue; + printk_seq = con->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } + console_unlock(); + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + /* no timeout limit */ + msleep(100); + } else if (remaining < 100) { + msleep(remaining); + remaining = 0; + } else { + msleep(100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} +EXPORT_SYMBOL(pr_flush); + +static bool printer_should_wake(struct console *con, u64 seq) +{ + short flags; + + if (kthread_should_stop()) + return true; + + /* + * This is an unsafe read to con->flags, but false positives + * are not an issue as long as they are rare. + */ + flags = data_race(READ_ONCE(con->flags)); + + if (!(flags & CON_ENABLED) || + (flags & CON_PAUSED) || + atomic_read(&console_lock_count) == -1) { + return false; + } + + return prb_read_valid(prb, seq, NULL); +} + +static int printk_kthread_func(void *data) +{ + struct console *con = data; + char *dropped_text = NULL; + char *ext_text = NULL; + bool progress; + u64 seq = 0; + char *text; + int error; + + pr_info("%sconsole [%s%d]: printing thread started\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + +#ifdef CONFIG_HAVE_ATOMIC_CONSOLE + if (con->write_atomic) + con->atomic_data = alloc_atomic_data(con->flags); +#endif + + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) + goto out; + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) + goto out; + } else { + dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); + if (!dropped_text) + goto out; + } + + for (;;) { + error = wait_event_interruptible(log_wait, printer_should_wake(con, seq)); + + if (kthread_should_stop()) + break; + + if (error) + continue; + + do { + error = mutex_lock_interruptible(&con->lock); + if (error) + break; + + if (!console_is_usable(con, false)) { + mutex_unlock(&con->lock); + break; + } + + if ((con->flags & CON_PAUSED) || !console_printer_tryenter()) { + mutex_unlock(&con->lock); + break; + } + + /* + * Even though the printk kthread is always preemptible, it is + * still not allowed to call cond_resched() from within + * console drivers. The task may become non-preemptible in the + * console driver call chain. For example, vt_console_print() + * takes a spinlock and then can call into fbcon_redraw(), + * which can conditionally invoke cond_resched(). + */ + console_may_schedule = 0; + progress = console_emit_next_record(con, text, ext_text, + dropped_text, false, NULL); + + seq = con->seq; + + console_printer_exit(); + + mutex_unlock(&con->lock); + } while (progress); + } +out: + kfree(dropped_text); + kfree(ext_text); + kfree(text); + pr_info("%sconsole [%s%d]: printing thread stopped\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return 0; +} + +/* Must be called within console_lock(). */ +static void start_printk_kthread(struct console *con) +{ + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + con->thread = NULL; + pr_err("%sconsole [%s%d]: unable to start printing thread\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return; + } +} + /* * Delayed printk version, for scheduler-internal messages: */ @ kernel/printk/printk.c:3825 @ static void wake_up_klogd_work_func(struct irq_work *irq_work) } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = @ kernel/printk/printk.c:4188 @ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP -static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); -static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant - * spinning lock is not owned by any CPU. + * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. * * Context: Any context. */ -void __printk_wait_on_cpu_lock(void) +void __printk_cpu_sync_wait(void) { do { cpu_relax(); - } while (atomic_read(&printk_cpulock_owner) != -1); + } while (atomic_read(&printk_cpu_sync_owner) != -1); } -EXPORT_SYMBOL(__printk_wait_on_cpu_lock); +EXPORT_SYMBOL(__printk_cpu_sync_wait); /** - * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant - * spinning lock. + * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant + * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the @ kernel/printk/printk.c:4216 @ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ -int __printk_cpu_trylock(void) +int __printk_cpu_sync_try_get(void) { int cpu; int old; @ kernel/printk/printk.c:4226 @ int __printk_cpu_trylock(void) /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with - * __printk_cpu_unlock:B. + * __printk_cpu_sync_put:B. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then - * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_put:A can never read from + * __printk_cpu_sync_try_get:B. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of this CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of this CPU */ - old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, - cpu); /* LMM(__printk_cpu_trylock:A) */ + old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, + cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing - * data: LMM(__printk_cpu_trylock:B) + * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ - atomic_inc(&printk_cpulock_nested); + atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } -EXPORT_SYMBOL(__printk_cpu_trylock); +EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** - * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ -void __printk_cpu_unlock(void) +void __printk_cpu_sync_put(void) { - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); + if (atomic_read(&printk_cpu_sync_nested)) { + atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: - * LMM(__printk_cpu_unlock:A) + * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs - * with __printk_cpu_trylock:A. + * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, - * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of the next CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of the next CPU */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ + atomic_set_release(&printk_cpu_sync_owner, + -1); /* LMM(__printk_cpu_sync_put:B) */ } -EXPORT_SYMBOL(__printk_cpu_unlock); +EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ @ kernel/ptrace.c:200 @ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) & __TASK_TRACED) + WRITE_ONCE(task->__state, __TASK_TRACED); + else + task->saved_state = __TASK_TRACED; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +#else WRITE_ONCE(task->__state, __TASK_TRACED); +#endif ret = true; } spin_unlock_irq(&task->sighand->siglock); @ kernel/ptrace.c:221 @ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) + unsigned long flags; + bool frozen = true; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && + READ_ONCE(task->__state) != __TASK_TRACED) return; WARN_ON(!task->ptrace || task->parent != current); @ kernel/ptrace.c:235 @ static void ptrace_unfreeze_traced(struct task_struct *task) * Recheck state under the lock to close this race. */ spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); - } + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) == __TASK_TRACED) + WRITE_ONCE(task->__state, TASK_TRACED); + +#ifdef CONFIG_PREEMPT_RT + else if (task->saved_state == __TASK_TRACED) + task->saved_state = TASK_TRACED; +#endif + else + frozen = false; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (frozen && __fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + spin_unlock_irq(&task->sighand->siglock); } @ kernel/rcu/tasks.h:1348 @ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = true; } -static void rcu_tasks_initiate_self_tests(void) +void rcu_tasks_initiate_self_tests(void) { pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU @ kernel/rcu/tasks.h:1385 @ static int rcu_tasks_verify_self_tests(void) return ret; } late_initcall(rcu_tasks_verify_self_tests); -#else /* #ifdef CONFIG_PROVE_RCU */ -static void rcu_tasks_initiate_self_tests(void) { } -#endif /* #else #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_PROVE_RCU */ void __init rcu_init_tasks_generic(void) { @ kernel/rcu/tasks.h:1400 @ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif - - // Run the self-tests. - rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @ kernel/rcu/tree.c:2279 @ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded, needwake = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); + offloaded = rcu_rdp_is_offloaded(rdp); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || rdp->gpwrap) { @ kernel/rcu/tree.c:2447 @ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @ kernel/rcu/tree.c:2473 @ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); @ kernel/sched/core.c:989 @ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY + +static int tsk_is_polling(struct task_struct *p) +{ +#ifdef TIF_POLLING_NRFLAG + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +#else + return 0; +#endif +} + +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @ kernel/sched/core.c:2203 @ void migrate_disable(void) preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; + preempt_lazy_disable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @ kernel/sched/core.c:2215 @ void migrate_enable(void) if (p->migration_disabled > 1) { p->migration_disabled--; return; + } else if (WARN_ON_ONCE(p->migration_disabled == 0)) { + return; } /* @ kernel/sched/core.c:2234 @ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; + preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @ kernel/sched/core.c:3272 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) + if (match_state && !task_match_state_lock(p, match_state)) return 0; cpu_relax(); } @ kernel/sched/core.c:3287 @ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || READ_ONCE(p->__state) == match_state) + if (!match_state || task_match_state_or_saved(p, match_state)) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @ kernel/sched/core.c:4465 @ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @ kernel/sched/core.c:4930 @ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Cache only the VMAP stack. The final deallocation is in + * delayed_put_task_struct. + */ + put_task_stack_sched(prev); put_task_struct_rcu_user(prev); } @ kernel/sched/core.c:6269 @ static void __sched notrace __schedule(unsigned int sched_mode) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; @ kernel/sched/core.c:6481 @ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY +/* + * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is + * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as + * preempt_lazy_count counter >0. + */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static inline int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption @ kernel/sched/core.c:6518 @ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); @ kernel/sched/core.c:6552 @ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; + do { /* * Because the function tracer can trace preempt_count_sub() @ kernel/sched/core.c:8735 @ void __init init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @ kernel/sched/fair.c:4396 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @ kernel/sched/fair.c:4420 @ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @ kernel/sched/fair.c:4566 @ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @ kernel/sched/fair.c:4715 @ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @ kernel/sched/fair.c:5478 @ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (task_current(rq, p)) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @ kernel/sched/fair.c:7175 @ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @ kernel/sched/fair.c:11210 @ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @ kernel/sched/fair.c:11237 @ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (task_current(rq, p)) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } @ kernel/sched/features.h:51 @ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif #else /* @ kernel/sched/sched.h:2303 @ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); @ kernel/sched/swait.c:67 @ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); + WARN_ON(irqs_disabled()); raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { @ kernel/signal.c:1327 @ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, struct k_sigaction *action; int sig = info->si_signo; + /* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. + */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + struct task_struct *t = current; + + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = info->si_signo; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; @ kernel/signal.c:2302 @ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { @ kernel/smp.c:693 @ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); + local_irq_save(flags); flush_smp_call_function_queue(true); - if (local_softirq_pending()) - do_softirq(); + + if (local_softirq_pending()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + do_softirq(); + } else { + struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); + + if (ksoftirqd && !task_is_running(ksoftirqd)) + wake_up_process(ksoftirqd); + } + } local_irq_restore(flags); } @ kernel/softirq.c:627 @ static inline void tick_irq_exit(void) #endif } +static DEFINE_PER_CPU(struct task_struct *, timersd); +static DEFINE_PER_CPU(unsigned long, pending_timer_softirq); + +static unsigned int local_pending_timers(void) +{ + return __this_cpu_read(pending_timer_softirq); +} + +static void wake_timersd(void) +{ + struct task_struct *tsk = __this_cpu_read(timersd); + + if (tsk) + wake_up_process(tsk); +} + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED @ kernel/softirq.c:654 @ static inline void __irq_exit_rcu(void) preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !in_interrupt() && local_pending_timers()) + wake_timersd(); tick_irq_exit(); } @ kernel/softirq.c:984 @ static struct smp_hotplug_thread softirq_threads = { .thread_comm = "ksoftirqd/%u", }; +static void timersd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static int timersd_should_run(unsigned int cpu) +{ + return local_pending_timers(); +} + +static void run_timersd(unsigned int cpu) +{ + unsigned int timer_si; + + ksoftirqd_run_begin(); + + timer_si = local_pending_timers(); + __this_cpu_write(pending_timer_softirq, 0); + or_softirq_pending(timer_si); + + __do_softirq(); + + ksoftirqd_run_end(); +} + +#ifdef CONFIG_PREEMPT_RT +static void raise_ktimers_thread(unsigned int nr) +{ + trace_softirq_raise(nr); + __this_cpu_or(pending_timer_softirq, 1 << nr); +} + +void raise_hrtimer_softirq(void) +{ + raise_ktimers_thread(HRTIMER_SOFTIRQ); +} + +void raise_timer_softirq(void) +{ + unsigned long flags; + + local_irq_save(flags); + raise_ktimers_thread(TIMER_SOFTIRQ); + wake_timersd(); + local_irq_restore(flags); +} +#endif + +static struct smp_hotplug_thread timer_threads = { + .store = &timersd, + .setup = timersd_setup, + .thread_should_run = timersd_should_run, + .thread_fn = run_timersd, + .thread_comm = "ktimers/%u", +}; + static __init int spawn_ksoftirqd(void) { cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + BUG_ON(smpboot_register_percpu_thread(&timer_threads)); return 0; } @ kernel/time/hrtimer.c:1808 @ void hrtimer_interrupt(struct clock_event_device *dev) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @ kernel/time/hrtimer.c:1921 @ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @ kernel/time/timer.c:1769 @ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; } - raise_softirq(TIMER_SOFTIRQ); + raise_timer_softirq(); } /* @ kernel/trace/trace.c:2609 @ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | +#ifdef CONFIG_PREEMPT_LAZY + if (need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; +#endif + + return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (preempt_lazy_count() & 0xff) << 16 | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } @ kernel/trace/trace.c:4191 @ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / _-=> migrate-disable \n" - "# ||||| / delay \n" - "# cmd pid |||||| time | caller \n" - "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n" + "# / _-------=> irqs-off \n" + "# | / _------=> need-resched \n" + "# || / _-----=> need-resched-lazy\n" + "# ||| / _----=> hardirq/softirq \n" + "# |||| / _---=> preempt-depth \n" + "# ||||| / _--=> preempt-lazy-depth\n" + "# |||||| / _-=> migrate-disable \n" + "# ||||||| / delay \n" + "# cmd pid |||||||| time | caller \n" + "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @ kernel/trace/trace.c:4235 @ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); - seq_printf(m, "# %.*s|||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); + seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); + seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } void @ kernel/trace/trace_events.c:187 @ static int trace_define_common_fields(void) /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned char, preempt_lazy_count); return ret; } @ kernel/trace/trace_output.c:445 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @ kernel/trace/trace_output.c:476 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; + hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : @ kernel/trace/trace_output.c:487 @ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.' ; - trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq); + trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq); if (entry->preempt_count & 0xf) trace_seq_printf(s, "%x", entry->preempt_count & 0xf); else trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + trace_seq_putc(s, '.'); + if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else @ lib/dump_stack.c:105 @ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) * Permit this cpu to perform nested stack dumps while serialising * against other CPUs */ - printk_cpu_lock_irqsave(flags); + printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); - printk_cpu_unlock_irqrestore(flags); + printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); @ lib/irq_poll.c:194 @ static int irq_poll_cpu_dead(unsigned int cpu) * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ + local_bh_disable(); local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); + local_bh_enable(); return 0; } @ lib/locking-selftest.c:29 @ #include <linux/rtmutex.h> #include <linux/local_lock.h> +#ifdef CONFIG_PREEMPT_RT +# define NON_RT(...) +#else +# define NON_RT(...) __VA_ARGS__ +#endif + /* * Change this to 1 if you want to see the failure printouts: */ @ lib/locking-selftest.c:148 @ static DEFINE_RT_MUTEX(rtmutex_Z2); #endif -static local_lock_t local_A = INIT_LOCAL_LOCK(local_A); +static DEFINE_PER_CPU(local_lock_t, local_A); /* * non-inlined runtime initializers, to let separate locks share @ lib/locking-selftest.c:721 @ GENERATE_TESTCASE(ABCDBCDA_rtmutex); #undef E +#ifdef CONFIG_PREEMPT_RT +# define RT_PREPARE_DBL_UNLOCK() { migrate_disable(); rcu_read_lock(); } +#else +# define RT_PREPARE_DBL_UNLOCK() +#endif /* * Double unlock: */ #define E() \ \ LOCK(A); \ + RT_PREPARE_DBL_UNLOCK(); \ UNLOCK(A); \ UNLOCK(A); /* fail */ @ lib/locking-selftest.c:817 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) @ lib/locking-selftest.c:826 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) +#endif #undef E1 #undef E2 +#ifndef CONFIG_PREEMPT_RT /* * Enabling hardirqs with a softirq-safe lock held: */ @ lib/locking-selftest.c:864 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #undef E1 #undef E2 +#endif + /* * Enabling irqs with an irq-safe lock held: */ @ lib/locking-selftest.c:895 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) @ lib/locking-selftest.c:904 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:943 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) @ lib/locking-selftest.c:952 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:993 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) @ lib/locking-selftest.c:1002 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:1057 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_rlock) #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) @ lib/locking-selftest.c:1066 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:1234 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:1282 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) +#endif #undef E1 #undef E2 @ lib/locking-selftest.c:1338 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) @ lib/locking-selftest.c:1354 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) # define I_WW(x) lockdep_reset_lock(&x.dep_map) -# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map) +# define I_LOCAL_LOCK(x) lockdep_reset_lock(this_cpu_ptr(&local_##x.dep_map)) #ifdef CONFIG_RT_MUTEXES # define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) #endif @ lib/locking-selftest.c:1414 @ static void reset_locks(void) init_shared_classes(); raw_spin_lock_init(&raw_lock_A); raw_spin_lock_init(&raw_lock_B); - local_lock_init(&local_A); + local_lock_init(this_cpu_ptr(&local_A)); ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep); memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2)); @ lib/locking-selftest.c:1432 @ static int unexpected_testcase_failures; static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) { - unsigned long saved_preempt_count = preempt_count(); + int saved_preempt_count = preempt_count(); +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + int saved_mgd_count = current->migration_disabled; +#endif + int saved_rcu_count = current->rcu_read_lock_nesting; +#endif WARN_ON(irqs_disabled()); @ lib/locking-selftest.c:1472 @ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) * count, so restore it: */ preempt_count_set(saved_preempt_count); + +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + while (current->migration_disabled > saved_mgd_count) + migrate_enable(); +#endif + + while (current->rcu_read_lock_nesting > saved_rcu_count) + rcu_read_unlock(); + WARN_ON_ONCE(current->rcu_read_lock_nesting < saved_rcu_count); +#endif + #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; @ lib/locking-selftest.c:1551 @ static inline void print_testname(const char *testname) #define DO_TESTCASE_2x2RW(desc, name, nr) \ DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ - DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ + NON_RT(DO_TESTCASE_2RW("soft-"desc, name##_soft, nr)) \ #define DO_TESTCASE_6x2x2RW(desc, name) \ DO_TESTCASE_2x2RW(desc, name, 123); \ @ lib/locking-selftest.c:1599 @ static inline void print_testname(const char *testname) #define DO_TESTCASE_2I(desc, name, nr) \ DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2IB(desc, name, nr) \ DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1B("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1B("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6I(desc, name, nr) \ DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6IRW(desc, name, nr) \ DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3RW("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2x3(desc, name) \ DO_TESTCASE_3(desc, name, 12); \ @ lib/locking-selftest.c:1703 @ static void ww_test_fail_acquire(void) #endif } +#ifdef CONFIG_PREEMPT_RT +#define ww_mutex_base_lock(b) rt_mutex_lock(b) +#define ww_mutex_base_trylock(b) rt_mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) +#else +#define ww_mutex_base_lock(b) mutex_lock(b) +#define ww_mutex_base_trylock(b) mutex_trylock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) mutex_unlock(b) +#endif + static void ww_test_normal(void) { int ret; @ lib/locking-selftest.c:1733 @ static void ww_test_normal(void) /* mutex_lock (and indirectly, mutex_lock_nested) */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - mutex_unlock(&o.base); + ww_mutex_base_lock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_interruptible (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_interruptible(&o.base); + ret = ww_mutex_base_lock_interruptible(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_killable (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_killable(&o.base); + ret = ww_mutex_base_lock_killable(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, succeeding */ o.ctx = (void *)~0UL; - ret = mutex_trylock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(!ret); if (ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, failing */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - ret = mutex_trylock(&o.base); + ww_mutex_base_lock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(ret); - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* nest_lock */ o.ctx = (void *)~0UL; - mutex_lock_nest_lock(&o.base, &t); - mutex_unlock(&o.base); + ww_mutex_base_lock_nest_lock(&o.base, &t); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); } @ lib/locking-selftest.c:1789 @ static void ww_test_two_contexts(void) static void ww_test_diff_class(void) { WWAI(&t); -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES t.ww_class = NULL; #endif WWL(&o, &t); @ lib/locking-selftest.c:1853 @ static void ww_test_edeadlk_normal(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @ lib/locking-selftest.c:1869 @ static void ww_test_edeadlk_normal(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); WWL(&o2, &t); @ lib/locking-selftest.c:1879 @ static void ww_test_edeadlk_normal_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:1895 @ static void ww_test_edeadlk_normal_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); ww_mutex_lock_slow(&o2, &t); @ lib/locking-selftest.c:1905 @ static void ww_test_edeadlk_no_unlock(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @ lib/locking-selftest.c:1921 @ static void ww_test_edeadlk_no_unlock(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWL(&o2, &t); } @ lib/locking-selftest.c:1930 @ static void ww_test_edeadlk_no_unlock_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:1946 @ static void ww_test_edeadlk_no_unlock_slow(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); ww_mutex_lock_slow(&o2, &t); } @ lib/locking-selftest.c:1955 @ static void ww_test_edeadlk_acquire_more(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:1976 @ static void ww_test_edeadlk_acquire_more_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:1997 @ static void ww_test_edeadlk_acquire_more_edeadlk(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @ lib/locking-selftest.c:2023 @ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @ lib/locking-selftest.c:2048 @ static void ww_test_edeadlk_acquire_wrong(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:2073 @ static void ww_test_edeadlk_acquire_wrong_slow(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ lib/locking-selftest.c:2714 @ static void wait_context_tests(void) static void local_lock_2(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ - local_lock_release(&local_A); + local_lock(&local_A); /* IRQ-ON */ + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ lib/locking-selftest.c:2724 @ static void local_lock_2(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } static void local_lock_3A(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ lib/locking-selftest.c:2744 @ static void local_lock_3A(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } static void local_lock_3B(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ lib/locking-selftest.c:2764 @ static void local_lock_3B(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); @ lib/locking-selftest.c:2880 @ void locking_selftest(void) printk("------------------------\n"); printk("| Locking API testsuite:\n"); printk("----------------------------------------------------------------------------\n"); - printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); + printk(" | spin |wlock |rlock |mutex | wsem | rsem |rtmutex\n"); printk(" --------------------------------------------------------------------------\n"); init_shared_classes(); @ lib/locking-selftest.c:2953 @ void locking_selftest(void) DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); printk(" --------------------------------------------------------------------------\n"); - /* * irq-context testcases: */ DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); - DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); + NON_RT(DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A)); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); @ lib/nmi_backtrace.c:102 @ bool nmi_cpu_backtrace(struct pt_regs *regs) * Allow nested NMI backtraces while serializing * against other CPUs. */ - printk_cpu_lock_irqsave(flags); + printk_cpu_sync_get_irqsave(flags); if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @ lib/nmi_backtrace.c:113 @ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } - printk_cpu_unlock_irqrestore(flags); + printk_cpu_sync_put_irqrestore(flags); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } @ localversion-rt:1 @ +-rt19 @ mm/memcontrol.c:172 @ struct mem_cgroup_event { struct work_struct remove; }; -static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* Stuffs for move charges at task migration. */ @ mm/memcontrol.c:263 @ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } +struct memcg_stock_pcp; static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); + unsigned int nr_pages, + bool stock_lock_acquried); static void obj_cgroup_release(struct percpu_ref *ref) { @ mm/memcontrol.c:300 @ static void obj_cgroup_release(struct percpu_ref *ref) nr_pages = nr_bytes >> PAGE_SHIFT; if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages, false); spin_lock_irqsave(&css_set_lock, flags); list_del(&objcg->list); @ mm/memcontrol.c:525 @ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) return excess; } -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) -{ - unsigned long excess; - struct mem_cgroup_per_node *mz; - struct mem_cgroup_tree_per_node *mctz; - - mctz = soft_limit_tree.rb_tree_per_node[nid]; - if (!mctz) - return; - /* - * Necessary to update all ancestors when hierarchy is used. - * because their event counter is not touched. - */ - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = memcg->nodeinfo[nid]; - excess = soft_limit_excess(memcg); - /* - * We have to update the tree if mz is on RB-tree or - * mem is over its softlimit. - */ - if (excess || mz->on_tree) { - unsigned long flags; - - spin_lock_irqsave(&mctz->lock, flags); - /* if on-tree, remove it */ - if (mz->on_tree) - __mem_cgroup_remove_exceeded(mz, mctz); - /* - * Insert again. mz->usage_in_excess will be updated. - * If excess is 0, no tree ops. - */ - __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock_irqrestore(&mctz->lock, flags); - } - } -} - static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { struct mem_cgroup_tree_per_node *mctz; @ mm/memcontrol.c:666 @ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[idx], val); @ mm/memcontrol.c:675 @ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); memcg_rstat_updated(memcg); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } /** @ mm/memcontrol.c:759 @ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; + if (IS_ENABLED(PREEMPT_RT)) + preempt_disable(); __this_cpu_add(memcg->vmstats_percpu->events[idx], count); memcg_rstat_updated(memcg); + if (IS_ENABLED(PREEMPT_RT)) + preempt_enable(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @ mm/memcontrol.c:796 @ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); } -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); - next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); - /* from time_after() in jiffies.h */ - if ((long)(next - val) < 0) { - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - default: - break; - } - __this_cpu_write(memcg->vmstats_percpu->targets[target], next); - return true; - } - return false; -} - -/* - * Check events in order. - * - */ -static void memcg_check_events(struct mem_cgroup *memcg, int nid) -{ - /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_THRESH))) { - bool do_softlimit; - - do_softlimit = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_SOFTLIMIT); - mem_cgroup_threshold(memcg); - if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, nid); - } -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @ mm/memcontrol.c:2022 @ struct obj_stock { }; struct memcg_stock_pcp { + /* Protects memcg_stock_pcp */ + local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; +#ifndef CONFIG_PREEMPTION + /* Protects only task_obj */ + local_lock_t task_obj_lock; struct obj_stock task_obj; +#endif struct obj_stock irq_obj; struct work_struct work; unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { + .stock_lock = INIT_LOCAL_LOCK(stock_lock), +#ifndef CONFIG_PREEMPTION + .task_obj_lock = INIT_LOCAL_LOCK(task_obj_lock), +#endif +}; static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct obj_stock *stock); +static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, + bool stock_lock_acquried); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); #else -static inline void drain_obj_stock(struct obj_stock *stock) +static inline struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, + bool stock_lock_acquried) { + return NULL; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) @ mm/memcontrol.c:2084 @ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @ mm/memcontrol.c:2092 @ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @ mm/memcontrol.c:2120 @ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock; - unsigned long flags; + struct memcg_stock_pcp *stock_pcp; + struct obj_cgroup *old; /* * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_irq_save(flags); +#ifndef CONFIG_PREEMPTION + local_lock(&memcg_stock.task_obj_lock); + old = drain_obj_stock(&this_cpu_ptr(&memcg_stock)->task_obj, NULL); + local_unlock(&memcg_stock.task_obj_lock); + if (old) + obj_cgroup_put(old); +#endif - stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(&stock->irq_obj); - if (in_task()) - drain_obj_stock(&stock->task_obj); - drain_stock(stock); - clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + local_lock_irq(&memcg_stock.stock_lock); + stock_pcp = this_cpu_ptr(&memcg_stock); + old = drain_obj_stock(&stock_pcp->irq_obj, stock_pcp); - local_irq_restore(flags); + drain_stock(stock_pcp); + clear_bit(FLUSHING_CACHED_CHARGE, &stock_pcp->flags); + + local_unlock_irq(&memcg_stock.stock_lock); + if (old) + obj_cgroup_put(old); } /* * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock; - unsigned long flags; + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); - local_irq_save(flags); - - stock = this_cpu_ptr(&memcg_stock); + lockdep_assert_held(&stock->stock_lock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); css_get(&memcg->css); @ mm/memcontrol.c:2166 @ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); +} - local_irq_restore(flags); +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages, + bool stock_lock_acquried) +{ + unsigned long flags; + + if (stock_lock_acquried) { + __refill_stock(memcg, nr_pages); + return; + } + local_lock_irqsave(&memcg_stock.stock_lock, flags); + __refill_stock(memcg, nr_pages); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @ mm/memcontrol.c:2188 @ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_all_stock(struct mem_cgroup *root_memcg) { - int cpu, curcpu; + int cpu; /* If someone's already draining, avoid adding running more workers. */ if (!mutex_trylock(&percpu_charge_mutex)) @ mm/memcontrol.c:2199 @ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. */ - curcpu = get_cpu(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @ mm/memcontrol.c:2215 @ static void drain_all_stock(struct mem_cgroup *root_memcg) rcu_read_unlock(); if (flush && - !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { - if (cpu == curcpu) - drain_local_stock(&stock->work); - else - schedule_work_on(cpu, &stock->work); - } + !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + schedule_work_on(cpu, &stock->work); } - put_cpu(); + cpus_read_unlock(); mutex_unlock(&percpu_charge_mutex); } @ mm/memcontrol.c:2619 @ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, done_restock: if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); + refill_stock(memcg, batch - nr_pages, false); /* * If the hierarchy is above the normal consumption range, schedule @ mm/memcontrol.c:2732 @ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) * can only be accessed after disabling interrupt. User context code can * access interrupt object stock, but not vice versa. */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +static inline struct obj_stock *get_obj_stock(unsigned long *pflags, + bool *stock_lock_acquried) { struct memcg_stock_pcp *stock; +#ifndef CONFIG_PREEMPTION if (likely(in_task())) { *pflags = 0UL; - preempt_disable(); + *stock_lock_acquried = false; + local_lock(&memcg_stock.task_obj_lock); stock = this_cpu_ptr(&memcg_stock); return &stock->task_obj; } - - local_irq_save(*pflags); +#endif + *stock_lock_acquried = true; + local_lock_irqsave(&memcg_stock.stock_lock, *pflags); stock = this_cpu_ptr(&memcg_stock); return &stock->irq_obj; } -static inline void put_obj_stock(unsigned long flags) +static inline void put_obj_stock(unsigned long flags, + bool stock_lock_acquried) { - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); +#ifndef CONFIG_PREEMPTION + if (likely(!stock_lock_acquried)) { + local_unlock(&memcg_stock.task_obj_lock); + return; + } +#endif + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @ mm/memcontrol.c:2939 @ static void memcg_free_cache_id(int id) * @nr_pages: number of pages to uncharge */ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages) + unsigned int nr_pages, + bool stock_lock_acquried) { struct mem_cgroup *memcg; @ mm/memcontrol.c:2948 @ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); - refill_stock(memcg, nr_pages); + refill_stock(memcg, nr_pages, stock_lock_acquried); css_put(&memcg->css); } @ mm/memcontrol.c:3022 @ void __memcg_kmem_uncharge_page(struct page *page, int order) return; objcg = __folio_objcg(folio); - obj_cgroup_uncharge_pages(objcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages, false); folio->memcg_data = 0; obj_cgroup_put(objcg); } @ mm/memcontrol.c:3030 @ void __memcg_kmem_uncharge_page(struct page *page, int order) void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { + bool stock_lock_acquried; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); + struct obj_cgroup *old = NULL; + struct obj_stock *stock; int *bytes; + stock = get_obj_stock(&flags, &stock_lock_acquried); /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx * changes. */ if (stock->cached_objcg != objcg) { - drain_obj_stock(stock); + old = drain_obj_stock(stock, stock_lock_acquried); + obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; @ mm/memcontrol.c:3088 @ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - put_obj_stock(flags); + put_obj_stock(flags, stock_lock_acquried); + if (old) + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { + bool stock_lock_acquried; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); + struct obj_stock *stock; bool ret = false; + stock = get_obj_stock(&flags, &stock_lock_acquried); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - put_obj_stock(flags); + put_obj_stock(flags, stock_lock_acquried); return ret; } -static void drain_obj_stock(struct obj_stock *stock) +static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock, + bool stock_lock_acquried) { struct obj_cgroup *old = stock->cached_objcg; if (!old) - return; + return NULL; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); if (nr_pages) - obj_cgroup_uncharge_pages(old, nr_pages); + obj_cgroup_uncharge_pages(old, nr_pages, stock_lock_acquried); /* * The leftover is flushed to the centralized per-memcg value. @ mm/memcontrol.c:3159 @ static void drain_obj_stock(struct obj_stock *stock) stock->cached_pgdat = NULL; } - obj_cgroup_put(old); stock->cached_objcg = NULL; + return old; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @ mm/memcontrol.c:3168 @ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { struct mem_cgroup *memcg; +#ifndef CONFIG_PREEMPTION if (in_task() && stock->task_obj.cached_objcg) { memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } +#endif if (stock->irq_obj.cached_objcg) { memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) @ mm/memcontrol.c:3187 @ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { + bool stock_lock_acquried; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); + struct obj_stock *stock; unsigned int nr_pages = 0; + struct obj_cgroup *old = NULL; + stock = get_obj_stock(&flags, &stock_lock_acquried); if (stock->cached_objcg != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + old = drain_obj_stock(stock, stock_lock_acquried); obj_cgroup_get(objcg); stock->cached_objcg = objcg; stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) @ mm/memcontrol.c:3209 @ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - put_obj_stock(flags); + put_obj_stock(flags, stock_lock_acquried); + if (old) + obj_cgroup_put(old); if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages, false); } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) @ mm/memcontrol.c:3734 @ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } break; case RES_SOFT_LIMIT: +#ifndef CONFIG_PREEMPT_RT memcg->soft_limit = nr_pages; ret = 0; +#else + ret = -EOPNOTSUPP; +#endif break; } return ret ?: nbytes; @ mm/memcontrol.c:4044 @ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, return 0; } -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) -{ - struct mem_cgroup_threshold_ary *t; - unsigned long usage; - int i; - - rcu_read_lock(); - if (!swap) - t = rcu_dereference(memcg->thresholds.primary); - else - t = rcu_dereference(memcg->memsw_thresholds.primary); - - if (!t) - goto unlock; - - usage = mem_cgroup_usage(memcg, swap); - - /* - * current_threshold points to threshold just below or equal to usage. - * If it's not true, a threshold was crossed after last - * call of __mem_cgroup_threshold(). - */ - i = t->current_threshold; - - /* - * Iterate backward over array of thresholds starting from - * current_threshold and check if a threshold is crossed. - * If none of thresholds below usage is crossed, we read - * only one element of the array here. - */ - for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) - eventfd_signal(t->entries[i].eventfd, 1); - - /* i = current_threshold + 1 */ - i++; - - /* - * Iterate forward over array of thresholds starting from - * current_threshold+1 and check if a threshold is crossed. - * If none of thresholds above usage is crossed, we read - * only one element of the array here. - */ - for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) - eventfd_signal(t->entries[i].eventfd, 1); - - /* Update current_threshold */ - t->current_threshold = i - 1; -unlock: - rcu_read_unlock(); -} - -static void mem_cgroup_threshold(struct mem_cgroup *memcg) -{ - while (memcg) { - __mem_cgroup_threshold(memcg, false); - if (do_memsw_account()) - __mem_cgroup_threshold(memcg, true); - - memcg = parent_mem_cgroup(memcg); - } -} - -static int compare_thresholds(const void *a, const void *b) -{ - const struct mem_cgroup_threshold *_a = a; - const struct mem_cgroup_threshold *_b = b; - - if (_a->threshold > _b->threshold) - return 1; - - if (_a->threshold < _b->threshold) - return -1; - - return 0; -} - static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) { struct mem_cgroup_eventfd_list *ev; @ mm/memcontrol.c:4065 @ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args, enum res_type type) -{ - struct mem_cgroup_thresholds *thresholds; - struct mem_cgroup_threshold_ary *new; - unsigned long threshold; - unsigned long usage; - int i, size, ret; - - ret = page_counter_memparse(args, "-1", &threshold); - if (ret) - return ret; - - mutex_lock(&memcg->thresholds_lock); - - if (type == _MEM) { - thresholds = &memcg->thresholds; - usage = mem_cgroup_usage(memcg, false); - } else if (type == _MEMSWAP) { - thresholds = &memcg->memsw_thresholds; - usage = mem_cgroup_usage(memcg, true); - } else - BUG(); - - /* Check if a threshold crossed before adding a new one */ - if (thresholds->primary) - __mem_cgroup_threshold(memcg, type == _MEMSWAP); - - size = thresholds->primary ? thresholds->primary->size + 1 : 1; - - /* Allocate memory for new array of thresholds */ - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto unlock; - } - new->size = size; - - /* Copy thresholds (if any) to new array */ - if (thresholds->primary) - memcpy(new->entries, thresholds->primary->entries, - flex_array_size(new, entries, size - 1)); - - /* Add new threshold */ - new->entries[size - 1].eventfd = eventfd; - new->entries[size - 1].threshold = threshold; - - /* Sort thresholds. Registering of new threshold isn't time-critical */ - sort(new->entries, size, sizeof(*new->entries), - compare_thresholds, NULL); - - /* Find current threshold */ - new->current_threshold = -1; - for (i = 0; i < size; i++) { - if (new->entries[i].threshold <= usage) { - /* - * new->current_threshold will not be used until - * rcu_assign_pointer(), so it's safe to increment - * it here. - */ - ++new->current_threshold; - } else - break; - } - - /* Free old spare buffer and save old primary buffer as spare */ - kfree(thresholds->spare); - thresholds->spare = thresholds->primary; - - rcu_assign_pointer(thresholds->primary, new); - - /* To be sure that nobody uses thresholds */ - synchronize_rcu(); - -unlock: - mutex_unlock(&memcg->thresholds_lock); - - return ret; -} - -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); -} - -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); -} - -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, enum res_type type) -{ - struct mem_cgroup_thresholds *thresholds; - struct mem_cgroup_threshold_ary *new; - unsigned long usage; - int i, j, size, entries; - - mutex_lock(&memcg->thresholds_lock); - - if (type == _MEM) { - thresholds = &memcg->thresholds; - usage = mem_cgroup_usage(memcg, false); - } else if (type == _MEMSWAP) { - thresholds = &memcg->memsw_thresholds; - usage = mem_cgroup_usage(memcg, true); - } else - BUG(); - - if (!thresholds->primary) - goto unlock; - - /* Check if a threshold crossed before removing */ - __mem_cgroup_threshold(memcg, type == _MEMSWAP); - - /* Calculate new number of threshold */ - size = entries = 0; - for (i = 0; i < thresholds->primary->size; i++) { - if (thresholds->primary->entries[i].eventfd != eventfd) - size++; - else - entries++; - } - - new = thresholds->spare; - - /* If no items related to eventfd have been cleared, nothing to do */ - if (!entries) - goto unlock; - - /* Set thresholds array to NULL if we don't have thresholds */ - if (!size) { - kfree(new); - new = NULL; - goto swap_buffers; - } - - new->size = size; - - /* Copy thresholds and find current threshold */ - new->current_threshold = -1; - for (i = 0, j = 0; i < thresholds->primary->size; i++) { - if (thresholds->primary->entries[i].eventfd == eventfd) - continue; - - new->entries[j] = thresholds->primary->entries[i]; - if (new->entries[j].threshold <= usage) { - /* - * new->current_threshold will not be used - * until rcu_assign_pointer(), so it's safe to increment - * it here. - */ - ++new->current_threshold; - } - j++; - } - -swap_buffers: - /* Swap primary and spare array */ - thresholds->spare = thresholds->primary; - - rcu_assign_pointer(thresholds->primary, new); - - /* To be sure that nobody uses thresholds */ - synchronize_rcu(); - - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } -unlock: - mutex_unlock(&memcg->thresholds_lock); -} - -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); -} - -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); -} - -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd, const char *args) -{ - struct mem_cgroup_eventfd_list *event; - - event = kmalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - spin_lock(&memcg_oom_lock); - - event->eventfd = eventfd; - list_add(&event->list, &memcg->oom_notify); - - /* already in OOM ? */ - if (memcg->under_oom) - eventfd_signal(eventfd, 1); - spin_unlock(&memcg_oom_lock); - - return 0; -} - -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, - struct eventfd_ctx *eventfd) -{ - struct mem_cgroup_eventfd_list *ev, *tmp; - - spin_lock(&memcg_oom_lock); - - list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { - if (ev->eventfd == eventfd) { - list_del(&ev->list); - kfree(ev); - } - } - - spin_unlock(&memcg_oom_lock); -} - static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); @ mm/memcontrol.c:4305 @ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) #endif /* CONFIG_CGROUP_WRITEBACK */ +#ifndef CONFIG_PREEMPT_RT /* * DO NOT USE IN NEW FILES. * @ mm/memcontrol.c:4319 @ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) * possible. */ +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) +{ + unsigned long val, next; + + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); + /* from time_after() in jiffies.h */ + if ((long)(next - val) < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); + return true; + } + return false; +} + +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) +{ + unsigned long excess; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (!mctz) + return; + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = memcg->nodeinfo[nid]; + excess = soft_limit_excess(memcg); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irqrestore(&mctz->lock, flags); + } + } +} + +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + unsigned long usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below or equal to usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. + */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_memsw_account()) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +/* + * Check events in order. + * + */ +static void memcg_check_events(struct mem_cgroup *memcg, int nid) +{ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); + mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, nid); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; +} + +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long threshold; + unsigned long usage; + int i, size, ret; + + ret = page_counter_memparse(args, "-1", &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) + memcpy(new->entries, thresholds->primary->entries, + flex_array_size(new, entries, size - 1)); + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(*new->entries), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold <= usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } else + break; + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long usage; + int i, j, size, entries; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + if (!thresholds->primary) + goto unlock; + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + size = entries = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + else + entries++; + } + + new = thresholds->spare; + + /* If no items related to eventfd have been cleared, nothing to do */ + if (!entries) + goto unlock; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold <= usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } +unlock: + mutex_unlock(&memcg->thresholds_lock); +} + +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup_eventfd_list *event; + + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + spin_lock(&memcg_oom_lock); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? */ + if (memcg->under_oom) + eventfd_signal(eventfd, 1); + spin_unlock(&memcg_oom_lock); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct mem_cgroup_eventfd_list *ev, *tmp; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + spin_unlock(&memcg_oom_lock); +} + /* * Unregister event and free resources. * @ mm/memcontrol.c:4914 @ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, return ret; } +#else + +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return -EOPNOTSUPP; +} + +static void memcg_check_events(struct mem_cgroup *memcg, int nid) { } + +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @ mm/memcontrol.c:7098 @ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); - refill_stock(memcg, nr_pages); + refill_stock(memcg, nr_pages, false); } static int __init cgroup_memory(char *s) @ mm/memcontrol.c:7238 @ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. + * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU + * variables are synchronised by keeping preemption disabled. */ - VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, -nr_entries); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + VM_BUG_ON(!irqs_disabled()); + mem_cgroup_charge_statistics(memcg, -nr_entries); + } else { + preempt_disable(); + mem_cgroup_charge_statistics(memcg, -nr_entries); + preempt_enable(); + } + memcg_check_events(memcg, page_to_nid(page)); css_put(&memcg->css); @ mm/vmalloc.c:1925 @ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) return ERR_PTR(err); } - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vaddr; } @ mm/vmalloc.c:2009 @ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) order = get_order(size); rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; @ mm/vmalloc.c:2033 @ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) break; } - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); /* Allocate new block if nothing was found */ @ mm/workingset.c:436 @ static struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { + struct address_space *mapping; + /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @ mm/workingset.c:446 @ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + mapping = container_of(node->array, struct address_space, i_pages); + lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { @ mm/zsmalloc.c:33 @ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * lock ordering: + * page_lock + * pool->migrate_lock + * class->lock + * zspage->lock + */ + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @ mm/zsmalloc.c:68 @ #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> +#include <linux/local_lock.h> #define ZSPAGE_MAGIC 0x58 @ mm/zsmalloc.c:112 @ #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -/* - * Memory for allocating for handle keeps object position by - * encoding <page, obj_idx> and the encoded value has a room - * in least bit(ie, look at obj_to_location). - * We use the bit to synchronize between object access by - * user and migration. - */ -#define HANDLE_PIN_BIT 0 - /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. @ mm/zsmalloc.c:124 @ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @ mm/zsmalloc.c:162 @ enum fullness_group { NR_ZS_FULLNESS, }; -enum zs_stat_type { +enum class_stat_type { CLASS_EMPTY, CLASS_ALMOST_EMPTY, CLASS_ALMOST_FULL, @ mm/zsmalloc.c:217 @ struct size_class { struct zs_size_stat stats; }; -/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ -static void SetPageHugeObject(struct page *page) -{ - SetPageOwnerPriv1(page); -} - -static void ClearPageHugeObject(struct page *page) -{ - ClearPageOwnerPriv1(page); -} - -static int PageHugeObject(struct page *page) -{ - return PageOwnerPriv1(page); -} - /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. @ mm/zsmalloc.c:257 @ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; - /* A wait queue for when migration races with async_free_zspage() */ - struct wait_queue_head migration_wait; - atomic_long_t isolated_pages; - bool destroying; #endif + /* protect page/zspage migration */ + rwlock_t migrate_lock; }; struct zspage { struct { + unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; @ mm/zsmalloc.c:280 @ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + #ifdef CONFIG_COMPACTION static int zs_register_migration(struct zs_pool *pool); static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); @ mm/zsmalloc.c:317 @ static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @ mm/zsmalloc.c:371 @ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +/* class->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. - */ - WRITE_ONCE(*(unsigned long *)handle, obj); + *(unsigned long *)handle = obj; } /* zpool driver */ @ mm/zsmalloc.c:456 @ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); - -static bool is_zspage_isolated(struct zspage *zspage) -{ - return zspage->isolated; -} +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static __maybe_unused int is_first_page(struct page *page) { @ mm/zsmalloc.c:515 @ static void get_zspage_mapping(struct zspage *zspage, *class_idx = zspage->class; } +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) @ mm/zsmalloc.c:547 @ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_inc(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_dec(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ +/* type can be of enum type class_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, int type) { @ mm/zsmalloc.c:723 @ static void insert_zspage(struct size_class *class, { struct zspage *head; - zs_stat_inc(class, fullness, 1); + class_stat_inc(class, fullness, 1); head = list_first_entry_or_null(&class->fullness_list[fullness], struct zspage, list); /* @ mm/zsmalloc.c:745 @ static void remove_zspage(struct size_class *class, enum fullness_group fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); - VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); - zs_stat_dec(class, fullness, 1); + class_stat_dec(class, fullness, 1); } /* @ mm/zsmalloc.c:770 @ static enum fullness_group fix_fullness_group(struct size_class *class, if (newfg == currfg) goto out; - if (!is_zspage_isolated(zspage)) { - remove_zspage(class, zspage, currfg); - insert_zspage(class, zspage, newfg); - } - + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); - out: return newfg; } @ mm/zsmalloc.c:823 @ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - if (unlikely(PageHugeObject(page))) + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) return NULL; return page->freelist; @ mm/zsmalloc.c:845 @ static void obj_to_location(unsigned long obj, struct page **page, *obj_idx = (obj & OBJ_INDEX_MASK); } +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + /** * location_to_obj - get obj value encoded from (<page>, <obj_idx>) * @page: page object resides in zspage @ mm/zsmalloc.c:872 @ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct page *page, void *obj) +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { - if (unlikely(PageHugeObject(page))) { + unsigned long handle; + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page->index; + handle = page->index; } else - return *(unsigned long *)obj; -} + handle = *(unsigned long *)obj; -static inline int testpin_tag(unsigned long handle) -{ - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -} + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; -static inline int trypin_tag(unsigned long handle) -{ - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void pin_tag(unsigned long handle) __acquires(bitlock) -{ - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void unpin_tag(unsigned long handle) __releases(bitlock) -{ - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); + *phandle = handle & ~OBJ_ALLOCATED_TAG; + return true; } static void reset_page(struct page *page) @ mm/zsmalloc.c:896 @ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); - ClearPageHugeObject(page); page->freelist = NULL; } @ mm/zsmalloc.c:947 @ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } @ mm/zsmalloc.c:958 @ static void free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. The page locks trylock_zspage got will be released + * by __free_zspage. + */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; @ mm/zsmalloc.c:1042 @ static void create_page_chain(struct size_class *class, struct zspage *zspage, SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) - SetPageHugeObject(page); + SetZsHugePage(zspage); } else { prev_page->freelist = page; } @ mm/zsmalloc.c:1246 @ void *zs_map_object(struct zs_pool *pool, unsigned long handle, unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; @ mm/zsmalloc.c:1258 @ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - /* From now on, migration cannot move the object */ - pin_tag(handle); - + /* It guarantees it can get zspage from handle safely */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - /* migration cannot move any subpage in this zspage */ + /* + * migration cannot move any zpages in this zspage. Here, class->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. + */ migrate_read_lock(zspage); + read_unlock(&pool->migrate_lock); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @ mm/zsmalloc.c:1293 @ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (likely(!PageHugeObject(page))) + if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; @ mm/zsmalloc.c:1307 @ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); @ mm/zsmalloc.c:1328 @ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); - unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @ mm/zsmalloc.c:1353 @ size_t zs_huge_class_size(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_huge_class_size); -static unsigned long obj_malloc(struct size_class *class, +static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; + struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; + class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); @ mm/zsmalloc.c:1380 @ static unsigned long obj_malloc(struct size_class *class, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); - if (likely(!PageHugeObject(m_page))) + if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else @ mm/zsmalloc.c:1389 @ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - zs_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @ mm/zsmalloc.c:1424 @ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; + /* class->lock effectively protects the zpage migration */ spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); + class_stat_inc(class, OBJ_USED, 1); spin_unlock(&class->lock); return handle; @ mm/zsmalloc.c:1447 @ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } spin_lock(&class->lock); - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_USED, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @ mm/zsmalloc.c:1465 @ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct size_class *class, unsigned long obj) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @ mm/zsmalloc.c:1475 @ static void obj_free(struct size_class *class, unsigned long obj) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class->size * f_objidx) & ~PAGE_MASK; + f_offset = (class_size * f_objidx) & ~PAGE_MASK; zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) @ mm/zsmalloc.c:1496 @ void zs_free(struct zs_pool *pool, unsigned long handle) struct zspage *zspage; struct page *f_page; unsigned long obj; - unsigned int f_objidx; - int class_idx; struct size_class *class; enum fullness_group fullness; - bool isolated; if (unlikely(!handle)) return; - pin_tag(handle); + /* + * The pool->migrate_lock protects the race with zpage's migration + * so it's safe to get the page from handle. + */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_location(obj, &f_page, &f_objidx); + obj_to_page(obj, &f_page); zspage = get_zspage(f_page); - - migrate_read_lock(zspage); - - get_zspage_mapping(zspage, &class_idx, &fullness); - class = pool->size_class[class_idx]; - + class = zspage_class(pool, zspage); spin_lock(&class->lock); - obj_free(class, obj); + read_unlock(&pool->migrate_lock); + + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) { - migrate_read_unlock(zspage); + if (fullness != ZS_EMPTY) goto out; - } - isolated = is_zspage_isolated(zspage); - migrate_read_unlock(zspage); - /* If zspage is isolated, zs_page_putback will free the zspage */ - if (likely(!isolated)) - free_zspage(pool, class, zspage); + free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); - unpin_tag(handle); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @ mm/zsmalloc.c:1597 @ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - unsigned long head; int offset = 0; int index = *obj_idx; unsigned long handle = 0; @ mm/zsmalloc.c:1606 @ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(page, addr + offset); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (trypin_tag(handle)) - break; - handle = 0; - } + if (obj_allocated(page, addr + offset, &handle)) + break; offset += class->size; index++; @ mm/zsmalloc.c:1653 @ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Stop if there is no more space */ if (zspage_full(class, get_zspage(d_page))) { - unpin_tag(handle); ret = -ENOMEM; break; } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; - /* - * record_obj updates handle's value to free_obj and it will - * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which - * breaks synchronization using pin_tag(e,g, zs_free) so - * let's keep the lock bit. - */ - free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); - unpin_tag(handle); - obj_free(class, used_obj); + obj_free(class->size, used_obj); } /* Remember last position in this iteration */ @ mm/zsmalloc.c:1687 @ static struct zspage *isolate_zspage(struct size_class *class, bool source) zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], struct zspage, list); if (zspage) { - VM_BUG_ON(is_zspage_isolated(zspage)); remove_zspage(class, zspage, fg[i]); return zspage; } @ mm/zsmalloc.c:1707 @ static enum fullness_group putback_zspage(struct size_class *class, { enum fullness_group fullness; - VM_BUG_ON(is_zspage_isolated(zspage)); - fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); set_zspage_mapping(zspage, class->index, fullness); @ mm/zsmalloc.c:1775 @ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @ mm/zsmalloc.c:1793 @ static void inc_zspage_isolation(struct zspage *zspage) static void dec_zspage_isolation(struct zspage *zspage) { + VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } -static void putback_zspage_deferred(struct zs_pool *pool, - struct size_class *class, - struct zspage *zspage) -{ - enum fullness_group fg; - - fg = putback_zspage(class, zspage); - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); - -} - -static inline void zs_pool_dec_isolated(struct zs_pool *pool) -{ - VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); - /* - * Checking pool->destroying must happen after atomic_long_dec() - * for pool->isolated_pages above. Paired with the smp_mb() in - * zs_unregister_migration(). - */ - smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) - wake_up_all(&pool->migration_wait); -} - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @ mm/zsmalloc.c:1815 @ static void replace_sub_page(struct size_class *class, struct zspage *zspage, create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); - if (unlikely(PageHugeObject(oldpage))) + if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, page_mapping(oldpage)); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; - struct address_space *mapping; /* * Page is locked so zspage couldn't be destroyed. For detail, look at @ mm/zsmalloc.c:1832 @ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); - - /* - * Without class lock, fullness could be stale while class_idx is okay - * because class_idx is constant unless page is freed so we should get - * fullness again under class lock. - */ - get_zspage_mapping(zspage, &class_idx, &fullness); - mapping = page_mapping(page); - pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); - if (get_zspage_inuse(zspage) == 0) { - spin_unlock(&class->lock); - return false; - } - - /* zspage is isolated for object migration */ - if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - spin_unlock(&class->lock); - return false; - } - - /* - * If this is first time isolation for the zspage, isolate zspage from - * size_class to prevent further object allocation from the zspage. - */ - if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - get_zspage_mapping(zspage, &class_idx, &fullness); - atomic_long_inc(&pool->isolated_pages); - remove_zspage(class, zspage, fullness); - } - + migrate_write_lock(zspage); inc_zspage_isolation(zspage); - spin_unlock(&class->lock); + migrate_write_unlock(zspage); return true; } @ mm/zsmalloc.c:1844 @ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset, pos; - unsigned long handle, head; + int offset; + unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - int ret = -EAGAIN; /* * We cannot support the _NO_COPY case here, because copy needs to @ mm/zsmalloc.c:1863 @ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - zspage = get_zspage(page); - - /* Concurrent compactor cannot migrate any subpage in zspage */ - migrate_write_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); pool = mapping->private_data; - class = pool->size_class[class_idx]; - offset = get_first_obj_offset(page); + /* + * The pool migrate_lock protects the race between zpage migration + * and zs_free. + */ + write_lock(&pool->migrate_lock); + zspage = get_zspage(page); + class = zspage_class(pool, zspage); + + /* + * the class lock protects zpage alloc/free in the zspage. + */ spin_lock(&class->lock); - if (!get_zspage_inuse(zspage)) { - /* - * Set "offset" to end of the page so that every loops - * skips unnecessary object scanning. - */ - offset = PAGE_SIZE; - } + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); - pos = offset; + offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); - while (pos < PAGE_SIZE) { - head = obj_to_head(page, s_addr + pos); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (!trypin_tag(handle)) - goto unpin_objects; - } - pos += class->size; - } /* * Here, any user cannot access all objects in the zspage so let's move. @ mm/zsmalloc.c:1890 @ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, memcpy(d_addr, s_addr, PAGE_SIZE); kunmap_atomic(d_addr); - for (addr = s_addr + offset; addr < s_addr + pos; + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - BUG_ON(!testpin_tag(handle)); + if (obj_allocated(page, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); - new_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, new_obj); } } + kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - get_page(newpage); - - dec_zspage_isolation(zspage); - /* - * Page migration is done so let's putback isolated zspage to - * the list if @page is final isolated subpage in the zspage. + * Since we complete the data copy and set up new zspage structure, + * it's okay to release migration_lock. */ - if (!is_zspage_isolated(zspage)) { - /* - * We cannot race with zs_destroy_pool() here because we wait - * for isolation to hit zero before we start destroying. - * Also, we ensure that everyone can see pool->destroying before - * we start waiting. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } + write_unlock(&pool->migrate_lock); + spin_unlock(&class->lock); + dec_zspage_isolation(zspage); + migrate_write_unlock(zspage); + get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @ mm/zsmalloc.c:1921 @ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, reset_page(page); put_page(page); - page = newpage; - ret = MIGRATEPAGE_SUCCESS; -unpin_objects: - for (addr = s_addr + offset; addr < s_addr + pos; - addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - BUG_ON(!testpin_tag(handle)); - unpin_tag(handle); - } - } - kunmap_atomic(s_addr); - spin_unlock(&class->lock); - migrate_write_unlock(zspage); - - return ret; + return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fg; - struct address_space *mapping; struct zspage *zspage; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - mapping = page_mapping(page); - pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); + migrate_write_lock(zspage); dec_zspage_isolation(zspage); - if (!is_zspage_isolated(zspage)) { - /* - * Due to page_lock, we cannot free zspage immediately - * so let's defer. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - spin_unlock(&class->lock); + migrate_write_unlock(zspage); } static const struct address_space_operations zsmalloc_aops = { @ mm/zsmalloc.c:1957 @ static int zs_register_migration(struct zs_pool *pool) return 0; } -static bool pool_isolated_are_drained(struct zs_pool *pool) -{ - return atomic_long_read(&pool->isolated_pages) == 0; -} - -/* Function for resolving migration */ -static void wait_for_isolated_drain(struct zs_pool *pool) -{ - - /* - * We're in the process of destroying the pool, so there are no - * active allocations. zs_page_isolate() fails for completely free - * zspages, so we need only wait for the zs_pool's isolated - * count to hit zero. - */ - wait_event(pool->migration_wait, - pool_isolated_are_drained(pool)); -} - static void zs_unregister_migration(struct zs_pool *pool) { - pool->destroying = true; - /* - * We need a memory barrier here to ensure global visibility of - * pool->destroying. Thus pool->isolated pages will either be 0 in which - * case we don't care, or it will be > 0 and pool->destroying will - * ensure that we wake up once isolation hits 0. - */ - smp_mb(); - wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @ mm/zsmalloc.c:1988 @ static void async_free_zspage(struct work_struct *work) spin_unlock(&class->lock); } - list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); @ mm/zsmalloc.c:2051 @ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; + /* protect the race between zpage migration and zs_free */ + write_lock(&pool->migrate_lock); + /* protect zpage allocation/free */ spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { + /* protect someone accessing the zspage(i.e., zs_map_object) */ + migrate_write_lock(src_zspage); if (!zs_can_compact(class)) break; @ mm/zsmalloc.c:2066 @ static unsigned long __zs_compact(struct zs_pool *pool, cc.s_page = get_first_page(src_zspage); while ((dst_zspage = isolate_zspage(class, false))) { + migrate_write_lock_nested(dst_zspage); + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched @ mm/zsmalloc.c:2077 @ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + if (rwlock_is_contended(&pool->migrate_lock)) + break; } /* Stop if we couldn't find slot */ @ mm/zsmalloc.c:2088 @ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_unlock(src_zspage); free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; - } + } else + migrate_write_unlock(src_zspage); spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); cond_resched(); + write_lock(&pool->migrate_lock); spin_lock(&class->lock); } - if (src_zspage) + if (src_zspage) { putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); return pages_freed; } @ mm/zsmalloc.c:2215 @ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); + rwlock_init(&pool->migrate_lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err; -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pool->migration_wait); -#endif - if (create_cache(pool)) goto err; @ net/core/dev.c:225 @ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); } -static inline void rps_unlock(struct softnet_data *sd) +static inline void rps_lock_irq_disable(struct softnet_data *sd) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); +} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, @ net/core/dev.c:394 @ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(net); } @ net/core/dev.c:412 @ static void unlist_netdevice(struct net_device *dev) ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @ net/core/dev.c:1295 @ int dev_change_name(struct net_device *dev, const char *newname) netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); synchronize_rcu(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @ net/core/dev.c:3071 @ static void __netif_reschedule(struct Qdisc *q) sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @ net/core/dev.c:3134 @ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__dev_kfree_skb_irq); @ net/core/dev.c:3861 @ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. + * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit + * and then other tasks will only enqueue packets. The packets will be + * sent after the qdisc owner is scheduled again. To prevent this + * scenario the task always serialize on the lock. */ - contended = qdisc_is_running(q); + contended = IS_ENABLED(CONFIG_PREEMPT_RT) || qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); @ net/core/dev.c:4669 @ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, sd = &per_cpu(softnet_data, cpu); - local_irq_save(flags); - - rps_lock(sd); + rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); @ net/core/dev.c:4678 @ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock + * PREEMPT_RT needs to disable interrupts here for + * synchronisation needed in napi_schedule. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { if (!rps_ipi_queued(sd)) ____napi_schedule(sd, &sd->backlog); } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); goto enqueue; } drop: sd->dropped++; - rps_unlock(sd); - - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @ net/core/dev.c:4942 @ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @ net/core/dev.c:4951 @ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } @ net/core/dev.c:4980 @ int netif_rx(struct sk_buff *skb) { int ret; + local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @ net/core/dev.c:5762 @ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @ net/core/dev.c:5770 @ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @ net/core/dev.c:5788 @ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); return do_flush; #endif @ net/core/dev.c:6409 @ static void net_rps_action_and_irq_enable(struct softnet_data *sd) sd->rps_ipi_list = NULL; local_irq_enable(); + preempt_check_resched_rt(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) @ net/core/dev.c:6456 @ static int process_backlog(struct napi_struct *napi, int quota) } - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @ net/core/dev.c:6472 @ static int process_backlog(struct napi_struct *napi, int quota) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); } return work; @ net/core/dev.c:6492 @ void __napi_schedule(struct napi_struct *n) local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @ net/core/dev.c:11324 @ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; @ net/core/link_watch.c:58 @ static void rfc2863_policy(struct net_device *dev) if (operstate == dev->operstate) return; - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: @ net/core/link_watch.c:77 @ static void rfc2863_policy(struct net_device *dev) dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } @ net/core/rtnetlink.c:845 @ static void set_operstate(struct net_device *dev, unsigned char transition) } if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } } @ net/core/rtnetlink.c:2782 @ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { @ net/hsr/hsr_device.c:33 @ static bool is_slave_up(struct net_device *dev) static void __hsr_set_operstate(struct net_device *dev, int transition) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->operstate != transition) { dev->operstate = transition; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } else { - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } } @ net/ipv4/inet_hashtables.c:640 @ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { + local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); + local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); @ net/ipv4/inet_hashtables.c:674 @ int inet_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } EXPORT_SYMBOL_GPL(inet_hash); -void inet_unhash(struct sock *sk) +static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb = NULL; - spinlock_t *lock; - if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) { - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &ilb->lock; - } else { - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - } - spin_lock_bh(lock); - if (sk_unhashed(sk)) - goto unlock; - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); if (ilb) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + inet_unhash2(hashinfo, sk); ilb->count--; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -unlock: - spin_unlock_bh(lock); +} + +void inet_unhash(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + /* Don't disable bottom halves while acquiring the lock to + * avoid circular locking dependency on PREEMPT_RT. + */ + spin_lock(&ilb->lock); + __inet_unhash(sk, ilb); + spin_unlock(&ilb->lock); + } else { + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + spin_lock_bh(lock); + __inet_unhash(sk, NULL); + spin_unlock_bh(lock); + } } EXPORT_SYMBOL_GPL(inet_unhash); @ net/ipv6/inet6_hashtables.c:336 @ int inet6_hash(struct sock *sk) { int err = 0; - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); + if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); - local_bh_enable(); - } return err; } @ net/sunrpc/svc_xprt.c:444 @ void svc_xprt_do_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) return; - cpu = get_cpu(); + cpu = get_cpu_light(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); atomic_long_inc(&pool->sp_stats.packets); @ net/sunrpc/svc_xprt.c:468 @ void svc_xprt_do_enqueue(struct svc_xprt *xprt) rqstp = NULL; out_unlock: rcu_read_unlock(); - put_cpu(); + put_cpu_light(); trace_svc_xprt_do_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);