Index: linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst
===================================================================
--- linux-5.15.orig/Documentation/admin-guide/cgroup-v1/memory.rst
+++ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:67 @ Brief summary of control files.
 tasks				 attach a task(thread) and show list of threads
 cgroup.procs			 show list of processes
 cgroup.event_control		 an interface for event_fd()
+				 This knob is not available on CONFIG_PREEMPT_RT systems.
 memory.usage_in_bytes		 show current usage for memory
 				 (See 5.5 for details)
 memory.memsw.usage_in_bytes	 show current usage for memory+Swap
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:79 @ Brief summary of control files.
 memory.max_usage_in_bytes	 show max memory usage recorded
 memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
 memory.soft_limit_in_bytes	 set/show soft limit of memory usage
+				 This knob is not available on CONFIG_PREEMPT_RT systems.
 memory.stat			 show various statistics
 memory.use_hierarchy		 set/show hierarchical account enabled
 				 This knob is deprecated and shouldn't be
Index: linux-5.15/Documentation/dev-tools/kcov.rst
===================================================================
--- linux-5.15.orig/Documentation/dev-tools/kcov.rst
+++ linux-5.15/Documentation/dev-tools/kcov.rst
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:53 @ program using kcov:
     #include <sys/mman.h>
     #include <unistd.h>
     #include <fcntl.h>
+    #include <linux/types.h>

     #define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
     #define KCOV_ENABLE			_IO('c', 100)
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:181 @ Comparison operands collection is simila
 		/* Read number of comparisons collected. */
 		n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
 		for (i = 0; i < n; i++) {
+			uint64_t ip;
+
 			type = cover[i * KCOV_WORDS_PER_CMP + 1];
 			/* arg1 and arg2 - operands of the comparison. */
 			arg1 = cover[i * KCOV_WORDS_PER_CMP + 2];
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:257 @ selectively from different subsystems.

 .. code-block:: c

+    /* Same includes and defines as above.
*/ + struct kcov_remote_arg { __u32 trace_mode; __u32 area_size; Index: linux-5.15/arch/alpha/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/alpha/include/asm/spinlock_types.h +++ linux-5.15/arch/alpha/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/arm/Kconfig =================================================================== --- linux-5.15.orig/arch/arm/Kconfig +++ linux-5.15/arch/arm/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:35 @ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:72 @ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_MMAP_RND_BITS if MMU @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:113 @ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:129 @ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M Index: linux-5.15/arch/arm/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/arm/include/asm/spinlock_types.h +++ linux-5.15/arch/arm/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/arm/include/asm/thread_info.h =================================================================== --- linux-5.15.orig/arch/arm/include/asm/thread_info.h +++ linux-5.15/arch/arm/include/asm/thread_info.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:55 @ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ __u32 cpu_domain; /* cpu domain */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:134 @ extern int vfp_restore_user_hwstate(stru #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ #define TIF_NOTIFY_SIGNAL 4 /* signal 
notifications exist */
+#define TIF_NEED_RESCHED_LAZY	9

 #define TIF_USING_IWMMXT	17
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:154 @ extern int vfp_restore_user_hwstate(stru
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)

 /* Checks for any syscall work in entry-common.S */
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:164 @ extern int vfp_restore_user_hwstate(stru
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+				 _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
 				 _TIF_NOTIFY_SIGNAL)
Index: linux-5.15/arch/arm/kernel/asm-offsets.c
===================================================================
--- linux-5.15.orig/arch/arm/kernel/asm-offsets.c
+++ linux-5.15/arch/arm/kernel/asm-offsets.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:46 @ int main(void)
   BLANK();
   DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
   DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
+  DEFINE(TI_PREEMPT_LAZY,	offsetof(struct thread_info, preempt_lazy_count));
   DEFINE(TI_TASK,		offsetof(struct thread_info, task));
   DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
   DEFINE(TI_CPU_DOMAIN,		offsetof(struct thread_info, cpu_domain));
Index: linux-5.15/arch/arm/kernel/entry-armv.S
===================================================================
--- linux-5.15.orig/arch/arm/kernel/entry-armv.S
+++ linux-5.15/arch/arm/kernel/entry-armv.S
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:209 @ __irq_svc:
 #ifdef CONFIG_PREEMPTION
 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
-	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
 	teq	r8, #0				@ if preempt count != 0
+	bne	1f				@ return from exception
+	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
+	tst	r0, #_TIF_NEED_RESCHED		@ if NEED_RESCHED is set
+	blne	svc_preempt			@ preempt!
+ + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:235 @ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: Index: linux-5.15/arch/arm/kernel/signal.c =================================================================== --- linux-5.15.orig/arch/arm/kernel/signal.c +++ linux-5.15/arch/arm/kernel/signal.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:610 @ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) Index: linux-5.15/arch/arm/mm/fault.c =================================================================== --- linux-5.15.orig/arch/arm/mm/fault.c +++ linux-5.15/arch/arm/mm/fault.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:403 @ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:476 @ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } Index: linux-5.15/arch/arm64/Kconfig =================================================================== --- linux-5.15.orig/arch/arm64/Kconfig +++ linux-5.15/arch/arm64/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:91 @ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:195 @ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PREEMPT_LAZY select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX select MMU_GATHER_RCU_TABLE_FREE @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:217 @ config ARM64 select PCI_DOMAINS_GENERIC if PCI select PCI_ECAM if (ACPI && PCI) select PCI_SYSCALL if PCI + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select POWER_RESET select POWER_SUPPLY select SPARSE_IRQ Index: linux-5.15/arch/arm64/include/asm/pgtable.h =================================================================== --- linux-5.15.orig/arch/arm64/include/asm/pgtable.h +++ linux-5.15/arch/arm64/include/asm/pgtable.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1004 @ static inline void update_mmu_cache(stru */ static inline bool arch_faults_on_old_pte(void) { - WARN_ON(preemptible()); + WARN_ON(is_migratable()); return 
!cpu_has_hw_af(); } Index: linux-5.15/arch/arm64/include/asm/preempt.h =================================================================== --- linux-5.15.orig/arch/arm64/include/asm/preempt.h +++ linux-5.15/arch/arm64/include/asm/preempt.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:73 @ static inline bool __preempt_count_dec_a * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ - return !pc || !READ_ONCE(ti->preempt_count); + if (!pc || !READ_ONCE(ti->preempt_count)) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if ((pc & ~PREEMPT_NEED_RESCHED)) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif } static inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + if (pc == preempt_offset) + return true; + + if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) + return false; + + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; +#endif } #ifdef CONFIG_PREEMPTION Index: linux-5.15/arch/arm64/include/asm/signal.h =================================================================== --- linux-5.15.orig/arch/arm64/include/asm/signal.h +++ linux-5.15/arch/arm64/include/asm/signal.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:25 @ static inline void __user *arch_untagged } #define arch_untagged_si_addr arch_untagged_si_addr +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #endif Index: linux-5.15/arch/arm64/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/arm64/include/asm/spinlock_types.h +++ linux-5.15/arch/arm64/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:8 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif Index: linux-5.15/arch/arm64/include/asm/thread_info.h =================================================================== --- linux-5.15.orig/arch/arm64/include/asm/thread_info.h +++ linux-5.15/arch/arm64/include/asm/thread_info.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:29 @ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:71 @ int arch_dup_task_struct(struct task_str #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:102 @ int arch_dup_task_struct(struct task_str #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << 
TIF_NOTIFY_SIGNAL) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:114 @ int arch_dup_task_struct(struct task_str _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ Index: linux-5.15/arch/arm64/kernel/asm-offsets.c =================================================================== --- linux-5.15.orig/arch/arm64/kernel/asm-offsets.c +++ linux-5.15/arch/arm64/kernel/asm-offsets.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:34 @ int main(void) BLANK(); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif Index: linux-5.15/arch/arm64/kernel/fpsimd.c =================================================================== --- linux-5.15.orig/arch/arm64/kernel/fpsimd.c +++ linux-5.15/arch/arm64/kernel/fpsimd.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:182 @ static void __get_cpu_fpsimd_context(voi * * The double-underscore version must only be called if you know the task * can't be preempted. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. */ static void get_cpu_fpsimd_context(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); __get_cpu_fpsimd_context(); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:215 @ static void __put_cpu_fpsimd_context(voi static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } static bool have_cpu_fpsimd_context(void) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1048 @ void fpsimd_thread_switch(struct task_st void fpsimd_flush_thread(void) { int vl, supported_vl; + void *sve_state = NULL; if (!system_supports_fpsimd()) return; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1061 @ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); + + /* Defer kfree() while in atomic context */ + sve_state = current->thread.sve_state; + current->thread.sve_state = NULL; /* * Reset the task vector length as required. 
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1098 @ void fpsimd_flush_thread(void) } put_cpu_fpsimd_context(); + kfree(sve_state); } /* Index: linux-5.15/arch/arm64/kernel/signal.c =================================================================== --- linux-5.15.orig/arch/arm64/kernel/signal.c +++ linux-5.15/arch/arm64/kernel/signal.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:925 @ static void do_signal(struct pt_regs *re void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:933 @ void do_notify_resume(struct pt_regs *re } else { local_daif_restore(DAIF_PROCCTX); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); Index: linux-5.15/arch/arm64/kvm/arm.c =================================================================== --- linux-5.15.orig/arch/arm64/kvm/arm.c +++ linux-5.15/arch/arm64/kvm/arm.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:831 @ int kvm_arch_vcpu_ioctl_run(struct kvm_v * involves poking the GIC, which must be done in a * non-preemptible context. */ - preempt_disable(); + migrate_disable(); kvm_pmu_flush_hwstate(vcpu); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:855 @ int kvm_arch_vcpu_ioctl_run(struct kvm_v kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); - preempt_enable(); + migrate_enable(); continue; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:924 @ int kvm_arch_vcpu_ioctl_run(struct kvm_v /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); - preempt_enable(); + migrate_enable(); /* * The ARMv8 architecture doesn't give the hypervisor Index: linux-5.15/arch/csky/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/csky/include/asm/spinlock_types.h +++ linux-5.15/arch/csky/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6 @ #ifndef __ASM_CSKY_SPINLOCK_TYPES_H #define __ASM_CSKY_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/hexagon/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/hexagon/include/asm/spinlock_types.h +++ linux-5.15/arch/hexagon/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11 @ #ifndef _ASM_SPINLOCK_TYPES_H #define _ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/ia64/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/ia64/include/asm/spinlock_types.h +++ linux-5.15/arch/ia64/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef _ASM_IA64_SPINLOCK_TYPES_H #define _ASM_IA64_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error 
"please don't include this file directly" #endif Index: linux-5.15/arch/powerpc/Kconfig =================================================================== --- linux-5.15.orig/arch/powerpc/Kconfig +++ linux-5.15/arch/powerpc/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:154 @ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:222 @ config PPC select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IOREMAP_PROT select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE select HAVE_KERNEL_LZO if DEFAULT_UIMAGE @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:239 @ config PPC select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ Index: linux-5.15/arch/powerpc/include/asm/simple_spinlock_types.h =================================================================== --- linux-5.15.orig/arch/powerpc/include/asm/simple_spinlock_types.h +++ linux-5.15/arch/powerpc/include/asm/simple_spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/powerpc/include/asm/smp.h =================================================================== --- linux-5.15.orig/arch/powerpc/include/asm/smp.h +++ linux-5.15/arch/powerpc/include/asm/smp.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:65 @ struct smp_ops_t { extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); +extern void smp_send_debugger_break_cpu(unsigned int cpu); extern void smp_send_debugger_break(void); extern void start_secondary_resume(void); extern void smp_generic_give_timebase(void); Index: linux-5.15/arch/powerpc/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/powerpc/include/asm/spinlock_types.h +++ linux-5.15/arch/powerpc/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H #define _ASM_POWERPC_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/powerpc/include/asm/stackprotector.h =================================================================== --- linux-5.15.orig/arch/powerpc/include/asm/stackprotector.h +++ linux-5.15/arch/powerpc/include/asm/stackprotector.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:27 @ static __always_inline void boot_init_st unsigned long canary; /* Try to get a semi random initial value. 
*/ +#ifdef CONFIG_PREEMPT_RT + canary = (unsigned long)&canary; +#else canary = get_random_canary(); +#endif canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; Index: linux-5.15/arch/powerpc/include/asm/thread_info.h =================================================================== --- linux-5.15.orig/arch/powerpc/include/asm/thread_info.h +++ linux-5.15/arch/powerpc/include/asm/thread_info.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:56 @ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ unsigned long local_flags; /* private flags for thread */ #ifdef CONFIG_LIVEPATCH unsigned long *livepatch_sp; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:104 @ void arch_setup_new_exec(void); #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SINGLESTEP 8 /* singlestepping active */ +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ #define TIF_SECCOMP 10 /* secure computing */ #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:120 @ void arch_setup_new_exec(void); #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:132 @ void arch_setup_new_exec(void); #define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) #define _TIF_SECCOMP (1<<TIF_SECCOMP) #define _TIF_RESTOREALL (1<<TIF_RESTOREALL) #define _TIF_NOERROR (1<<TIF_NOERROR) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:146 @ void arch_setup_new_exec(void); _TIF_SYSCALL_EMU) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ _TIF_NOTIFY_SIGNAL) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ Index: linux-5.15/arch/powerpc/kernel/interrupt.c =================================================================== --- linux-5.15.orig/arch/powerpc/kernel/interrupt.c +++ linux-5.15/arch/powerpc/kernel/interrupt.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:351 @ again: ti_flags = READ_ONCE(current_thread_info()->flags); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:557 @ notrace unsigned long interrupt_exit_ker /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: - if (IS_ENABLED(CONFIG_PREEMPT)) { + if (IS_ENABLED(CONFIG_PREEMPTION)) { /* Return to preemptible kernel context */ if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) { if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { + if ((preempt_count() == 0) && + (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } Index: linux-5.15/arch/powerpc/kernel/irq.c =================================================================== --- linux-5.15.orig/arch/powerpc/kernel/irq.c +++ linux-5.15/arch/powerpc/kernel/irq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:693 @ static inline void check_stack_overflow( } } +#ifndef CONFIG_PREEMPT_RT static __always_inline void call_do_softirq(const void *sp) { /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:712 @ static __always_inline void call_do_soft "r11", "r12" ); } +#endif static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:825 @ void *mcheckirq_ctx[NR_CPUS] __read_most void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { call_do_softirq(softirq_ctx[smp_processor_id()]); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { Index: linux-5.15/arch/powerpc/kernel/kgdb.c =================================================================== --- linux-5.15.orig/arch/powerpc/kernel/kgdb.c +++ linux-5.15/arch/powerpc/kernel/kgdb.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:123 @ int kgdb_skipexception(int exception, st static int kgdb_debugger_ipi(struct pt_regs *regs) { - kgdb_nmicallback(raw_smp_processor_id(), regs); + int cpu = raw_smp_processor_id(); + + if (!kgdb_roundup_delay(cpu)) + kgdb_nmicallback(cpu, regs); return 0; } #ifdef CONFIG_SMP +void kgdb_roundup_cpu(unsigned int cpu) +{ + smp_send_debugger_break_cpu(cpu); +} + void kgdb_roundup_cpus(void) { smp_send_debugger_break(); Index: linux-5.15/arch/powerpc/kernel/smp.c =================================================================== --- linux-5.15.orig/arch/powerpc/kernel/smp.c +++ linux-5.15/arch/powerpc/kernel/smp.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:593 @ static void debugger_ipi_callback(struct debugger_ipi(regs); } +void smp_send_debugger_break_cpu(unsigned int cpu) +{ + smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000); +} + void smp_send_debugger_break(void) { smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000); Index: linux-5.15/arch/powerpc/kernel/traps.c =================================================================== --- linux-5.15.orig/arch/powerpc/kernel/traps.c +++ linux-5.15/arch/powerpc/kernel/traps.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:263 @ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? 
" SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", Index: linux-5.15/arch/powerpc/kvm/Kconfig =================================================================== --- linux-5.15.orig/arch/powerpc/kvm/Kconfig +++ linux-5.15/arch/powerpc/kvm/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:181 @ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING Index: linux-5.15/arch/powerpc/platforms/pseries/iommu.c =================================================================== --- linux-5.15.orig/arch/powerpc/platforms/pseries/iommu.c +++ linux-5.15/arch/powerpc/platforms/pseries/iommu.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:27 @ #include <linux/of.h> #include <linux/iommu.h> #include <linux/rculist.h> +#include <linux/local_lock.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:199 @ static int tce_build_pSeriesLP(unsigned return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:228 @ static int tce_buildmulti_pSeriesLP(stru direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:240 @ static int tce_buildmulti_pSeriesLP(stru tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tceshift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> tceshift; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:275 @ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:451 @ static int tce_setrange_multi_pSeriesLP( DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:504 @ static int tce_setrange_multi_pSeriesLP( /* error cleanup: caller will clear whole range */ - local_irq_enable(); 
+ local_unlock_irq(&tce_page.lock); return rc; } Index: linux-5.15/arch/riscv/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/riscv/include/asm/spinlock_types.h +++ linux-5.15/arch/riscv/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9 @ #ifndef _ASM_RISCV_SPINLOCK_TYPES_H #define _ASM_RISCV_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/s390/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/s390/include/asm/spinlock_types.h +++ linux-5.15/arch/s390/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/sh/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/sh/include/asm/spinlock_types.h +++ linux-5.15/arch/sh/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef __ASM_SH_SPINLOCK_TYPES_H #define __ASM_SH_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/arch/sh/kernel/irq.c =================================================================== --- linux-5.15.orig/arch/sh/kernel/irq.c +++ linux-5.15/arch/sh/kernel/irq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:152 @ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct thread_info *curctx; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:180 @ void do_softirq_own_stack(void) "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" ); } +#endif #else static inline void handle_one_irq(unsigned int irq) { Index: linux-5.15/arch/sparc/kernel/irq_64.c =================================================================== --- linux-5.15.orig/arch/sparc/kernel/irq_64.c +++ linux-5.15/arch/sparc/kernel/irq_64.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:858 @ void __irq_entry handler_irq(int pil, st set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { void *orig_sp, *sp = softirq_stack[smp_processor_id()]; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:873 @ void do_softirq_own_stack(void) __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp)); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) Index: linux-5.15/arch/x86/Kconfig =================================================================== --- linux-5.15.orig/arch/x86/Kconfig +++ linux-5.15/arch/x86/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:110 @ config X86 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:235 @ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select 
HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API Index: linux-5.15/arch/x86/include/asm/irq_stack.h =================================================================== --- linux-5.15.orig/arch/x86/include/asm/irq_stack.h +++ linux-5.15/arch/x86/include/asm/irq_stack.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:205 @ IRQ_CONSTRAINTS, regs, vector); \ } +#ifndef CONFIG_PREEMPT_RT /* * Macro to invoke __do_softirq on the irq stack. This is only called from * task context when bottom halves are about to be reenabled and soft @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:219 @ __this_cpu_write(hardirq_stack_inuse, false); \ } +#endif + #else /* CONFIG_X86_64 */ /* System vector handlers always run on the stack they interrupted. */ #define run_sysvec_on_irqstack_cond(func, regs) \ Index: linux-5.15/arch/x86/include/asm/preempt.h =================================================================== --- linux-5.15.orig/arch/x86/include/asm/preempt.h +++ linux-5.15/arch/x86/include/asm/preempt.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:93 @ static __always_inline void __preempt_co * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if (preempt_count()) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u32 tmp; + tmp = raw_cpu_read_4(__preempt_count); + if (tmp == preempt_offset) + return true; + + /* preempt count == 0 ? */ + tmp &= ~PREEMPT_NEED_RESCHED; + if (tmp != preempt_offset) + return false; + /* XXX PREEMPT_LOCK_OFFSET */ + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); +#endif } #ifdef CONFIG_PREEMPTION Index: linux-5.15/arch/x86/include/asm/signal.h =================================================================== --- linux-5.15.orig/arch/x86/include/asm/signal.h +++ linux-5.15/arch/x86/include/asm/signal.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:31 @ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. 
+ */ +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT #define compat_sigset_t compat_sigset_t typedef sigset_t compat_sigset_t; Index: linux-5.15/arch/x86/include/asm/stackprotector.h =================================================================== --- linux-5.15.orig/arch/x86/include/asm/stackprotector.h +++ linux-5.15/arch/x86/include/asm/stackprotector.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:53 @ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 canary = 0; u64 tsc; #ifdef CONFIG_X86_64 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:64 @ static __always_inline void boot_init_st * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of it. */ +#ifndef CONFIG_PREEMPT_RT get_random_bytes(&canary, sizeof(canary)); +#endif tsc = rdtsc(); canary += tsc + (tsc << 32UL); canary &= CANARY_MASK; Index: linux-5.15/arch/x86/include/asm/thread_info.h =================================================================== --- linux-5.15.orig/arch/x86/include/asm/thread_info.h +++ linux-5.15/arch/x86/include/asm/thread_info.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:60 @ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ }; #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ + .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:96 @ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:121 @ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) Index: linux-5.15/arch/x86/kernel/irq_32.c =================================================================== --- linux-5.15.orig/arch/x86/kernel/irq_32.c +++ linux-5.15/arch/x86/kernel/irq_32.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:135 @ int irq_init_percpu_irqstack(unsigned in return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:152 @ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { Index: linux-5.15/arch/x86/kernel/kgdb.c 
=================================================================== --- linux-5.15.orig/arch/x86/kernel/kgdb.c +++ linux-5.15/arch/x86/kernel/kgdb.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:505 @ static int kgdb_nmi_handler(unsigned int if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ cpu = raw_smp_processor_id(); - kgdb_nmicallback(cpu, regs); - set_bit(cpu, was_in_debug_nmi); - touch_nmi_watchdog(); + + if (!kgdb_roundup_delay(cpu)) { + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); + touch_nmi_watchdog(); + } return NMI_HANDLED; } Index: linux-5.15/arch/x86/kvm/x86.c =================================================================== --- linux-5.15.orig/arch/x86/kvm/x86.c +++ linux-5.15/arch/x86/kvm/x86.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:8572 @ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + r = -EOPNOTSUPP; + goto out; + } +#endif + r = -ENOMEM; x86_emulator_cache = kvm_alloc_emulator_cache(); Index: linux-5.15/arch/xtensa/include/asm/spinlock_types.h =================================================================== --- linux-5.15.orig/arch/xtensa/include/asm/spinlock_types.h +++ linux-5.15/arch/xtensa/include/asm/spinlock_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif Index: linux-5.15/block/blk-mq.c =================================================================== --- linux-5.15.orig/block/blk-mq.c +++ linux-5.15/block/blk-mq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1570 @ static void __blk_mq_delay_run_hw_queue( return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); + int cpu = get_cpu_light(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); + put_cpu_light(); return; } - put_cpu(); + put_cpu_light(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, Index: linux-5.15/crypto/testmgr.c =================================================================== --- linux-5.15.orig/crypto/testmgr.c +++ linux-5.15/crypto/testmgr.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1064 @ static void generate_random_testvec_conf static void crypto_disable_simd_for_test(void) { - preempt_disable(); + migrate_disable(); __this_cpu_write(crypto_simd_disabled_for_test, true); } static void crypto_reenable_simd_for_test(void) { __this_cpu_write(crypto_simd_disabled_for_test, false); - preempt_enable(); + migrate_enable(); } /* Index: linux-5.15/drivers/block/zram/zram_drv.c =================================================================== --- linux-5.15.orig/drivers/block/zram/zram_drv.c +++ linux-5.15/drivers/block/zram/zram_drv.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:62 @ static void zram_free_page(struct zram * static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +#ifdef CONFIG_PREEMPT_RT +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + +static int zram_slot_trylock(struct zram 
*zram, u32 index)
+{
+	int ret;
+
+	ret = spin_trylock(&zram->table[index].lock);
+	if (ret)
+		__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+	return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+	spin_lock(&zram->table[index].lock);
+	__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+	__clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+	spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }

 static int zram_slot_trylock(struct zram *zram, u32 index)
 {
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:111 @ static void zram_slot_unlock(struct zram
 {
 	bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
 }
+#endif

 static inline bool init_done(struct zram *zram)
 {
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1207 @ static bool zram_meta_alloc(struct zram
 	if (!huge_class_size)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);

+	zram_meta_init_table_locks(zram, num_pages);
 	return true;
 }
Index: linux-5.15/drivers/block/zram/zram_drv.h
===================================================================
--- linux-5.15.orig/drivers/block/zram/zram_drv.h
+++ linux-5.15/drivers/block/zram/zram_drv.h
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:66 @ struct zram_table_entry {
 		unsigned long element;
 	};
 	unsigned long flags;
+	spinlock_t lock;
 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
 	ktime_t ac_time;
 #endif
Index: linux-5.15/drivers/char/tpm/tpm_tis.c
===================================================================
--- linux-5.15.orig/drivers/char/tpm/tpm_tis.c
+++ linux-5.15/drivers/char/tpm/tpm_tis.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:53 @ static inline struct tpm_tis_tcg_phy *to
 	return container_of(data, struct tpm_tis_tcg_phy, priv);
 }

+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flushes previous write operations to the chip so that subsequent
+ * ioread*()s won't stall a CPU.
+ */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:198 @ static int tpm_tcg_write_bytes(struct tp struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); return 0; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:225 @ static int tpm_tcg_write32(struct tpm_ti { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); - iowrite32(value, phy->iobase + addr); + tpm_tis_iowrite32(value, phy->iobase, addr); return 0; } Index: linux-5.15/drivers/firmware/efi/efi.c =================================================================== --- linux-5.15.orig/drivers/firmware/efi/efi.c +++ linux-5.15/drivers/firmware/efi/efi.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:70 @ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; -static bool disable_runtime; +static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); static int __init setup_noefi(char *arg) { disable_runtime = true; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:101 @ static int __init parse_efi_cmdline(char if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "runtime")) + disable_runtime = false; + if (parse_option_str(str, "nosoftreserve")) set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); Index: linux-5.15/drivers/gpu/drm/i915/display/intel_crtc.c =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/display/intel_crtc.c +++ linux-5.15/drivers/gpu/drm/i915/display/intel_crtc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:428 @ void intel_pipe_update_start(const struc */ intel_psr_wait_for_idle(new_crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:454 @ void intel_pipe_update_start(const struc break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:493 @ void intel_pipe_update_start(const struc return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:573 @ void intel_pipe_update_end(struct intel_ new_crtc_state->uapi.event = NULL; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); /* Send VRR Push to terminate Vblank */ intel_vrr_send_push(new_crtc_state); Index: linux-5.15/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c =================================================================== --- 
linux-5.15.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ linux-5.15/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:314 @ void __intel_breadcrumbs_park(struct int /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); + irq_work_queue(&b->irq_work); cond_resched(); + irq_work_sync(&b->irq_work); } } Index: linux-5.15/drivers/gpu/drm/i915/gt/intel_context.h =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/gt/intel_context.h +++ linux-5.15/drivers/gpu/drm/i915/gt/intel_context.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:166 @ static inline void intel_context_enter(s static inline void intel_context_mark_active(struct intel_context *ce) { - lockdep_assert_held(&ce->timeline->mutex); + lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || + test_bit(CONTEXT_IS_PARKED, &ce->flags)); ++ce->active_count; } Index: linux-5.15/drivers/gpu/drm/i915/gt/intel_context_types.h =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/gt/intel_context_types.h +++ linux-5.15/drivers/gpu/drm/i915/gt/intel_context_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:115 @ struct intel_context { #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 #define CONTEXT_NOPREEMPT 8 #define CONTEXT_LRCA_DIRTY 9 +#define CONTEXT_IS_PARKED 10 struct { u64 timeout_us; Index: linux-5.15/drivers/gpu/drm/i915/gt/intel_engine_pm.c =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ linux-5.15/drivers/gpu/drm/i915/gt/intel_engine_pm.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:83 @ static int __engine_unpark(struct intel_ return 0; } -#if IS_ENABLED(CONFIG_LOCKDEP) - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - unsigned long flags; - - local_irq_save(flags); - mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); - - return flags; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ - mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); - local_irq_restore(flags); -} - -#else - -static unsigned long __timeline_mark_lock(struct intel_context *ce) -{ - return 0; -} - -static void __timeline_mark_unlock(struct intel_context *ce, - unsigned long flags) -{ -} - -#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ - static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) { struct i915_request *rq = to_request(fence); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:129 @ static bool switch_to_kernel_context(str { struct intel_context *ce = engine->kernel_context; struct i915_request *rq; - unsigned long flags; bool result = true; /* GPU is pointing to the void, as good as in the kernel context. */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:170 @ static bool switch_to_kernel_context(str * engine->wakeref.count, we may see the request completion and retire * it causing an underflow of the engine->wakeref. 
*/ - flags = __timeline_mark_lock(ce); + set_bit(CONTEXT_IS_PARKED, &ce->flags); GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); rq = __i915_request_create(ce, GFP_NOWAIT); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:202 @ static bool switch_to_kernel_context(str result = false; out_unlock: - __timeline_mark_unlock(ce, flags); + clear_bit(CONTEXT_IS_PARKED, &ce->flags); return result; } Index: linux-5.15/drivers/gpu/drm/i915/gt/intel_execlists_submission.c =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ linux-5.15/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1289 @ static void execlists_dequeue(struct int * and context switches) submission. */ - spin_lock(&sched_engine->lock); + spin_lock_irq(&sched_engine->lock); /* * If the queue is higher priority than the last @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1389 @ static void execlists_dequeue(struct int * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); return; } } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1415 @ static void execlists_dequeue(struct int if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); - spin_unlock(&engine->sched_engine->lock); + spin_unlock_irq(&engine->sched_engine->lock); return; /* leave this for another sibling */ } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1577 @ done: */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); /* * We can skip poking the HW if we ended up with exactly the same set @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1603 @ done: } } -static void execlists_dequeue_irq(struct intel_engine_cs *engine) -{ - local_irq_disable(); /* Suspend interrupts across request submission */ - execlists_dequeue(engine); - local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ -} - static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2438 @ static void execlists_submission_tasklet } if (!engine->execlists.pending[0]) { - execlists_dequeue_irq(engine); + execlists_dequeue(engine); start_timeslice(engine); } Index: linux-5.15/drivers/gpu/drm/i915/i915_irq.c =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/i915_irq.c +++ linux-5.15/drivers/gpu/drm/i915/i915_irq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:889 @ static bool i915_get_crtc_scanoutpos(str */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Get optional system timestamp before query. */ if (stime) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:954 @ static bool i915_get_crtc_scanoutpos(str if (etime) *etime = ktime_get(); - /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); Index: linux-5.15/drivers/gpu/drm/i915/i915_request.c =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/i915_request.c +++ linux-5.15/drivers/gpu/drm/i915/i915_request.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:562 @ bool __i915_request_submit(struct i915_r RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:670 @ void __i915_request_unsubmit(struct i915 */ RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* Index: linux-5.15/drivers/gpu/drm/i915/i915_request.h =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/i915_request.h +++ linux-5.15/drivers/gpu/drm/i915/i915_request.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:612 @ i915_request_timeline(const struct i915_ { /* Valid only while the request is being constructed (or retired). */ return rcu_dereference_protected(rq->timeline, - lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); + lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || + test_bit(CONTEXT_IS_PARKED, &rq->context->flags)); } static inline struct i915_gem_context * Index: linux-5.15/drivers/gpu/drm/i915/i915_trace.h =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/i915_trace.h +++ linux-5.15/drivers/gpu/drm/i915/i915_trace.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#ifdef CONFIG_PREEMPT_RT +#define NOTRACE +#endif + #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:826 @ DEFINE_EVENT(i915_request, i915_request_ TP_ARGS(rq) ); -#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) +#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) Index: linux-5.15/drivers/gpu/drm/i915/i915_utils.h =================================================================== --- linux-5.15.orig/drivers/gpu/drm/i915/i915_utils.h +++ linux-5.15/drivers/gpu/drm/i915/i915_utils.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:346 @ wait_remaining_ms_from_jiffies(unsigned #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */ -#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) +#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) Index: linux-5.15/drivers/i2c/busses/i2c-cht-wc.c =================================================================== --- linux-5.15.orig/drivers/i2c/busses/i2c-cht-wc.c +++ linux-5.15/drivers/i2c/busses/i2c-cht-wc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:102 @ static irqreturn_t cht_wc_i2c_adap_threa * interrupt handler as well, so running the client irq handler from * this thread will cause things to lock up. 
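The intel_crtc.c and i915_irq.c hunks above rely on IS_ENABLED(CONFIG_PREEMPT_RT) evaluating to a compile-time constant, so the extra branches vanish entirely from non-RT builds. A hedged sketch of the idiom; the function name and body are illustrative only:

	#include <linux/kconfig.h>
	#include <linux/preempt.h>

	/*
	 * Keep a timestamp-sampling window preemption-free on PREEMPT_RT
	 * without local_irq_disable().  On !PREEMPT_RT the spinlock_t the
	 * callers already hold disables preemption by itself, so both
	 * calls compile away to nothing there.
	 */
	static void example_sample_window(void)
	{
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			preempt_disable();

		/* ... read hardware counters and the clock here ... */

		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			preempt_enable();
	}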
*/ - if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { - /* - * generic_handle_irq expects local IRQs to be disabled - * as normally it is called from interrupt context. - */ - local_irq_disable(); - generic_handle_irq(adap->client_irq); - local_irq_enable(); - } + if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) + generic_handle_irq_safe(adap->client_irq); return IRQ_HANDLED; } Index: linux-5.15/drivers/i2c/i2c-core-base.c =================================================================== --- linux-5.15.orig/drivers/i2c/i2c-core-base.c +++ linux-5.15/drivers/i2c/i2c-core-base.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1425 @ int i2c_handle_smbus_host_notify(struct if (irq <= 0) return -ENXIO; - generic_handle_irq(irq); + generic_handle_irq_safe(irq); return 0; } Index: linux-5.15/drivers/leds/trigger/Kconfig =================================================================== --- linux-5.15.orig/drivers/leds/trigger/Kconfig +++ linux-5.15/drivers/leds/trigger/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:67 @ config LEDS_TRIGGER_BACKLIGHT config LEDS_TRIGGER_CPU bool "LED CPU Trigger" + depends on !PREEMPT_RT help This allows LEDs to be controlled by active CPUs. This shows the active CPUs across an array of LEDs so you can see which Index: linux-5.15/drivers/md/raid5.c =================================================================== --- linux-5.15.orig/drivers/md/raid5.c +++ linux-5.15/drivers/md/raid5.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2221 @ static void raid_run_ops(struct stripe_h struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2282 @ static void raid_run_ops(struct stripe_h if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:7115 @ static int raid456_cpu_up_prepare(unsign __func__, cpu); return -ENOMEM; } + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); return 0; } Index: linux-5.15/drivers/md/raid5.h =================================================================== --- linux-5.15.orig/drivers/md/raid5.h +++ linux-5.15/drivers/md/raid5.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:638 @ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address Index: linux-5.15/drivers/mfd/ezx-pcap.c =================================================================== --- linux-5.15.orig/drivers/mfd/ezx-pcap.c +++ linux-5.15/drivers/mfd/ezx-pcap.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:196 @ static void pcap_isr_work(struct work_st ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); ezx_pcap_write(pcap, PCAP_REG_ISR, isr); - local_irq_disable(); service = isr & ~msr; for (irq = pcap->irq_base; service; service >>= 1, irq++) { if (service & 1) - generic_handle_irq(irq); + generic_handle_irq_safe(irq); } - local_irq_enable(); ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); } while (gpio_get_value(pdata->gpio)); } 
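The handler conversions above all funnel into generic_handle_irq_safe(), whose declaration is added to irqdesc.h later in this section; the kernel/irq implementation itself is not shown here. A minimal sketch, assuming it simply wraps generic_handle_irq() with the local interrupt state saved and restored (our reconstruction, not quoted from the patch):

	#include <linux/irq.h>

	/*
	 * Same effect as generic_handle_irq(), but callable from any
	 * context -- thread, hard interrupt, or interrupt-disabled --
	 * because the local interrupt state is saved and restored
	 * around the dispatch instead of being assumed off.
	 */
	int generic_handle_irq_safe(unsigned int irq)
	{
		unsigned long flags;
		int ret;

		local_irq_save(flags);
		ret = generic_handle_irq(irq);
		local_irq_restore(flags);

		return ret;
	}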
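The raid5 hunks above (and the fcoe/libfc hunks below) follow a second recurring RT pattern: get_cpu_light()/put_cpu_light() in place of get_cpu()/put_cpu(). Their definitions are not part of this section; the sketch below shows the shape these helpers have taken in RT trees, which is an assumption on our part, and why the new raid5_percpu::lock is needed:

	#include <linux/preempt.h>
	#include <linux/smp.h>

	/*
	 * Assumed RT-tree definitions: pin the task to the current CPU
	 * with migrate_disable() instead of disabling preemption.  The
	 * bracketed section stays preemptible, so another task on the
	 * same CPU can touch the same per-CPU data -- which is why
	 * raid_run_ops() above additionally takes the spinlock added to
	 * struct raid5_percpu.
	 */
	#define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
	#define put_cpu_light()	migrate_enable()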
Index: linux-5.15/drivers/misc/hi6421v600-irq.c =================================================================== --- linux-5.15.orig/drivers/misc/hi6421v600-irq.c +++ linux-5.15/drivers/misc/hi6421v600-irq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:121 @ static irqreturn_t hi6421v600_irq_handle * If both powerkey down and up IRQs are received, * handle them at the right order */ - generic_handle_irq(priv->irqs[POWERKEY_DOWN]); - generic_handle_irq(priv->irqs[POWERKEY_UP]); + generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); + generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:130 @ static irqreturn_t hi6421v600_irq_handle continue; for_each_set_bit(offset, &pending, BITS_PER_BYTE) { - generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); + generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); } } Index: linux-5.15/drivers/net/ethernet/netronome/nfp/abm/qdisc.c =================================================================== --- linux-5.15.orig/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +++ linux-5.15/drivers/net/ethernet/netronome/nfp/abm/qdisc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:461 @ nfp_abm_qdisc_graft(struct nfp_abm_link static void nfp_abm_stats_calculate(struct nfp_alink_stats *new, struct nfp_alink_stats *old, - struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_sync *bstats, struct gnet_stats_queue *qstats) { _bstats_update(bstats, new->tx_bytes - old->tx_bytes, Index: linux-5.15/drivers/net/usb/lan78xx.c =================================================================== --- linux-5.15.orig/drivers/net/usb/lan78xx.c +++ linux-5.15/drivers/net/usb/lan78xx.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1370 @ static void lan78xx_status(struct lan78x netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); lan78xx_defer_kevent(dev, EVENT_LINK_RESET); - if (dev->domain_data.phyirq > 0) { - local_irq_disable(); - generic_handle_irq(dev->domain_data.phyirq); - local_irq_enable(); - } + if (dev->domain_data.phyirq > 0) + generic_handle_irq_safe(dev->domain_data.phyirq); } else { netdev_warn(dev->net, "unexpected interrupt: 0x%08x\n", intdata); Index: linux-5.15/drivers/scsi/fcoe/fcoe.c =================================================================== --- linux-5.15.orig/drivers/scsi/fcoe/fcoe.c +++ linux-5.15/drivers/scsi/fcoe/fcoe.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1453 @ err2: static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1642 @ static inline int fcoe_filter_frames(str return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1687 @ static void fcoe_recv_frame(struct sk_bu */ hp = (struct fcoe_hdr *) skb_network_header(skb); - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); if 
(unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { if (stats->ErrorFrames < 5) printk(KERN_WARNING "fcoe: FCoE version " @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1719 @ static void fcoe_recv_frame(struct sk_bu goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } Index: linux-5.15/drivers/scsi/fcoe/fcoe_ctlr.c =================================================================== --- linux-5.15.orig/drivers/scsi/fcoe/fcoe_ctlr.c +++ linux-5.15/drivers/scsi/fcoe/fcoe_ctlr.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:831 @ static unsigned long fcoe_ctlr_age_fcfs( INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:867 @ static unsigned long fcoe_ctlr_age_fcfs( sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ Index: linux-5.15/drivers/scsi/libfc/fc_exch.c =================================================================== --- linux-5.15.orig/drivers/scsi/libfc/fc_exch.c +++ linux-5.15/drivers/scsi/libfc/fc_exch.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:828 @ static struct fc_exch *fc_exch_em_alloc( } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { Index: linux-5.15/drivers/staging/greybus/gpio.c =================================================================== --- linux-5.15.orig/drivers/staging/greybus/gpio.c +++ linux-5.15/drivers/staging/greybus/gpio.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:394 @ static int gb_gpio_request_handler(struc return -EINVAL; } - local_irq_disable(); - ret = generic_handle_irq(irq); - local_irq_enable(); - + ret = generic_handle_irq_safe(irq); if (ret) dev_err(dev, "failed to invoke irq handler\n"); Index: linux-5.15/drivers/tty/serial/8250/8250.h =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250.h +++ linux-5.15/drivers/tty/serial/8250/8250.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:159 @ static inline void serial_dl_write(struc up->dl_write(up, value); } +static inline void serial8250_set_IER(struct uart_8250_port *up, + unsigned char ier) +{ + struct uart_port *port = &up->port; + unsigned long flags; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); + + serial_out(up, UART_IER, ier); + + if (is_console) + console_atomic_unlock(flags); +} + +static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + unsigned int clearval = 0; + unsigned long flags; + unsigned int prior; + bool is_console; + + is_console = uart_console(port); + + if (up->capabilities & UART_CAP_UUE) + clearval = UART_IER_UUE; + + if (is_console) + console_atomic_lock(flags); + + prior = serial_port_in(port, UART_IER); + serial_port_out(port, UART_IER, clearval); + + if (is_console) + console_atomic_unlock(flags); + + return prior; +} + static inline 
bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:216 @ static inline bool serial8250_clear_THRI if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } Index: linux-5.15/drivers/tty/serial/8250/8250_core.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250_core.c +++ linux-5.15/drivers/tty/serial/8250/8250_core.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:268 @ static void serial8250_backup_timeout(st * Must disable interrupts or else we risk racing with the interrupt * based handler. */ - if (up->port.irq) { - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); - } + if (up->port.irq) + ier = serial8250_clear_IER(up); iir = serial_in(up, UART_IIR); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:292 @ static void serial8250_backup_timeout(st serial8250_tx_chars(up); if (up->port.irq) - serial_out(up, UART_IER, ier); + serial8250_set_IER(up, ier); spin_unlock_irqrestore(&up->port.lock, flags); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:579 @ serial8250_register_ports(struct uart_dr #ifdef CONFIG_SERIAL_8250_CONSOLE +static void univ8250_console_write_atomic(struct console *co, const char *s, + unsigned int count) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_atomic(up, s, count); +} + static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:680 @ static int univ8250_console_match(struct static struct console univ8250_console = { .name = "ttyS", + .write_atomic = univ8250_console_write_atomic, .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, Index: linux-5.15/drivers/tty/serial/8250/8250_fsl.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250_fsl.c +++ linux-5.15/drivers/tty/serial/8250/8250_fsl.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:63 @ int fsl8250_handle_irq(struct uart_port /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long flags; unsigned long delay; + bool is_console; + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); up->ier = port->serial_in(port, UART_IER); + if (is_console) + console_atomic_unlock(flags); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { Index: linux-5.15/drivers/tty/serial/8250/8250_ingenic.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250_ingenic.c +++ linux-5.15/drivers/tty/serial/8250/8250_ingenic.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:149 @ OF_EARLYCON_DECLARE(x1000_uart, "ingenic static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { + unsigned long flags; + bool is_console; int ier; switch (offset) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:172 @ static void ingenic_uart_serial_out(stru * If we have enabled modem status IRQs we should enable * modem mode. 
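Every read-modify-write of UART_IER in these 8250 hunks is bracketed the same way, so a write_atomic() console writer can never observe a half-updated register. The fragment below condenses the recurring pattern; the function name is invented for illustration, while console_atomic_lock() is the primitive used throughout this series:

	static void example_rmw_ier(struct uart_8250_port *up, unsigned int set)
	{
		unsigned long flags;
		bool is_console = uart_console(&up->port);
		unsigned int ier;

		/* Serialize against atomic console output on this port. */
		if (is_console)
			console_atomic_lock(flags);

		ier = serial_in(up, UART_IER);
		serial_out(up, UART_IER, ier | set);

		if (is_console)
			console_atomic_unlock(flags);
	}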
*/ + is_console = uart_console(p); + if (is_console) + console_atomic_lock(flags); ier = p->serial_in(p, UART_IER); + if (is_console) + console_atomic_unlock(flags); if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; Index: linux-5.15/drivers/tty/serial/8250/8250_mtk.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250_mtk.c +++ linux-5.15/drivers/tty/serial/8250/8250_mtk.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:225 @ static void mtk8250_shutdown(struct uart static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) + console_atomic_unlock(flags); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + + if (uart_console(port)) + console_atomic_lock(flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (uart_console(port)) + console_atomic_unlock(flags); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) Index: linux-5.15/drivers/tty/serial/8250/8250_port.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/8250/8250_port.c +++ linux-5.15/drivers/tty/serial/8250/8250_port.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:754 @ static void serial8250_set_sleep(struct serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } - serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); + serial8250_set_IER(p, sleep ? 
UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1429 @ static void serial8250_stop_rx(struct ua up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1459 @ void serial8250_em485_stop_tx(struct uar serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_port_out(&p->port, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1695 @ static void serial8250_disable_ms(struct mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void serial8250_enable_ms(struct uart_port *port) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1711 @ static void serial8250_enable_ms(struct up->ier |= UART_IER_MSI; serial8250_rpm_get(up); - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2136 @ static void serial8250_put_poll_char(str struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); wait_for_xmitr(up, BOTH_EMPTY); /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2149 @ static void serial8250_put_poll_char(str * and restore the IER */ wait_for_xmitr(up, BOTH_EMPTY); - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); serial8250_rpm_put(up); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2454 @ void serial8250_do_shutdown(struct uart_ */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2836 @ serial8250_do_set_termios(struct uart_po if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3301 @ EXPORT_SYMBOL_GPL(serial8250_set_default #ifdef CONFIG_SERIAL_8250_CONSOLE -static void serial8250_console_putchar(struct uart_port *port, int ch) +static void serial8250_console_putchar_locked(struct uart_port *port, int ch) { struct uart_8250_port *up = up_to_u8250p(port); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3309 @ static void serial8250_console_putchar(s serial_port_out(port, UART_TX, ch); } +static void serial8250_console_putchar(struct uart_port *port, int ch) +{ + struct uart_8250_port *up = up_to_u8250p(port); + unsigned long flags; + + wait_for_xmitr(up, UART_LSR_THRE); + + console_atomic_lock(flags); + serial8250_console_putchar_locked(port, ch); + console_atomic_unlock(flags); +} + /* * Restore serial console when h/w power-off detected */ @ 
linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3347 @ static void serial8250_console_restore(s serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } +void serial8250_console_write_atomic(struct uart_8250_port *up, + const char *s, unsigned int count) +{ + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; + + console_atomic_lock(flags); + + touch_nmi_watchdog(); + + ier = serial8250_clear_IER(up); + + if (atomic_fetch_inc(&up->console_printing)) { + uart_console_write(port, "\n", 1, + serial8250_console_putchar_locked); + } + uart_console_write(port, s, count, serial8250_console_putchar_locked); + atomic_dec(&up->console_printing); + + wait_for_xmitr(up, BOTH_EMPTY); + serial8250_set_IER(up, ier); + + console_atomic_unlock(flags); +} + /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3389 @ void serial8250_console_write(struct uar struct uart_port *port = &up->port; unsigned long flags; unsigned int ier; - int locked = 1; touch_nmi_watchdog(); - if (oops_in_progress) - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); - - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); + spin_lock_irqsave(&port->lock, flags); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3408 @ void serial8250_console_write(struct uar mdelay(port->rs485.delay_rts_before_send); } + atomic_inc(&up->console_printing); uart_console_write(port, s, count, serial8250_console_putchar); + atomic_dec(&up->console_printing); /* * Finally, wait for transmitter to become empty @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3423 @ void serial8250_console_write(struct uar if (em485->tx_stopped) up->rs485_stop_tx(up); } - - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); /* * The receive handling will happen properly because the @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3435 @ void serial8250_console_write(struct uar if (up->msr_saved_flags) serial8250_modem_status(up); - if (locked) - spin_unlock_irqrestore(&port->lock, flags); + spin_unlock_irqrestore(&port->lock, flags); } static unsigned int probe_baud(struct uart_port *port) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3455 @ static unsigned int probe_baud(struct ua int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3465 @ int serial8250_console_setup(struct uart if (!port->iobase && !port->membase) return -ENODEV; + atomic_set(&up->console_printing, 0); + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) Index: linux-5.15/drivers/tty/serial/amba-pl011.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/amba-pl011.c +++ linux-5.15/drivers/tty/serial/amba-pl011.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2343 @ pl011_console_write(struct console *co, { 
struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; - unsigned long flags; + unsigned long flags = 0; int locked = 1; clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2386 @ pl011_console_write(struct console *co, pl011_write(old_cr, uap, REG_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } Index: linux-5.15/drivers/tty/serial/omap-serial.c =================================================================== --- linux-5.15.orig/drivers/tty/serial/omap-serial.c +++ linux-5.15/drivers/tty/serial/omap-serial.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1258 @ serial_omap_console_write(struct console unsigned int ier; int locked = 1; - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1288 @ serial_omap_console_write(struct console check_modem_status(up); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init Index: linux-5.15/drivers/virt/acrn/irqfd.c =================================================================== --- linux-5.15.orig/drivers/virt/acrn/irqfd.c +++ linux-5.15/drivers/virt/acrn/irqfd.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:20 @ #include "acrn_drv.h" static LIST_HEAD(acrn_irqfd_clients); -static DEFINE_MUTEX(acrn_irqfds_mutex); /** * struct hsm_irqfd - Properties of HSM irqfd Index: linux-5.15/fs/afs/dir_silly.c =================================================================== --- linux-5.15.orig/fs/afs/dir_silly.c +++ linux-5.15/fs/afs/dir_silly.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:242 @ int afs_silly_iput(struct dentry *dentry struct dentry *alias; int ret; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); Index: linux-5.15/fs/cifs/readdir.c =================================================================== --- linux-5.15.orig/fs/cifs/readdir.c +++ linux-5.15/fs/cifs/readdir.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:72 @ cifs_prime_dcache(struct dentry *parent, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); Index: linux-5.15/fs/dcache.c =================================================================== --- linux-5.15.orig/fs/dcache.c +++ linux-5.15/fs/dcache.c @ 
linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2541 @ EXPORT_SYMBOL(d_rehash);
 
 static inline unsigned start_dir_add(struct inode *dir)
 {
-
+	/*
+	 * The caller has a spinlock_t (dentry::d_lock) acquired which disables
+	 * preemption on !PREEMPT_RT. On PREEMPT_RT the lock does not disable
+	 * preemption and it has to be done explicitly.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	for (;;) {
 		unsigned n = dir->i_dir_seq;
 		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2559 @ static inline void end_dir_add(struct in
 static inline void end_dir_add(struct inode *dir, unsigned n)
 {
 	smp_store_release(&dir->i_dir_seq, n + 2);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 }
 
 static void d_wait_lookup(struct dentry *dentry)
 {
-	if (d_in_lookup(dentry)) {
-		DECLARE_WAITQUEUE(wait, current);
-		add_wait_queue(dentry->d_wait, &wait);
-		do {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			spin_unlock(&dentry->d_lock);
-			schedule();
-			spin_lock(&dentry->d_lock);
-		} while (d_in_lookup(dentry));
-	}
+	struct swait_queue __wait;
+
+	if (!d_in_lookup(dentry))
+		return;
+
+	INIT_LIST_HEAD(&__wait.task_list);
+	do {
+		prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&dentry->d_lock);
+		schedule();
+		spin_lock(&dentry->d_lock);
+	} while (d_in_lookup(dentry));
+	finish_swait(dentry->d_wait, &__wait);
 }
 
 struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name,
-				wait_queue_head_t *wq)
+				struct swait_queue_head *wq)
 {
 	unsigned int hash = name->hash;
 	struct hlist_bl_head *b = in_lookup_hash(parent, hash);
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2697 @ void __d_lookup_done(struct dentry *dent
 	hlist_bl_lock(b);
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
 	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
-	wake_up_all(dentry->d_wait);
+	swake_up_all(dentry->d_wait);
 	dentry->d_wait = NULL;
 	hlist_bl_unlock(b);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
Index: linux-5.15/fs/fscache/internal.h
===================================================================
--- linux-5.15.orig/fs/fscache/internal.h
+++ linux-5.15/fs/fscache/internal.h
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:84 @ extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
 extern struct workqueue_struct *fscache_object_wq;
 extern struct workqueue_struct *fscache_op_wq;
-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
 
 extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n);
Index: linux-5.15/fs/fscache/main.c
===================================================================
--- linux-5.15.orig/fs/fscache/main.c
+++ linux-5.15/fs/fscache/main.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:44 @ struct kobject *fscache_root;
 struct workqueue_struct *fscache_object_wq;
 struct workqueue_struct *fscache_op_wq;
 
-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
-
 /* these values serve as lower bounds, will be adjusted in fscache_init() */
 static unsigned fscache_object_max_active = 4;
 static unsigned fscache_op_max_active = 2;
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:139 @ unsigned int fscache_hash(unsigned int s
 static int __init fscache_init(void)
 {
 	unsigned int nr_cpus = num_possible_cpus();
-	unsigned int cpu;
 	int ret;
 
 	fscache_object_max_active =
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:161 @ static int __init fscache_init(void)
 	if (!fscache_op_wq)
goto error_op_wq; - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); - ret = fscache_proc_init(); if (ret < 0) goto error_proc; Index: linux-5.15/fs/fscache/object.c =================================================================== --- linux-5.15.orig/fs/fscache/object.c +++ linux-5.15/fs/fscache/object.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:801 @ void fscache_object_destroy(struct fscac } EXPORT_SYMBOL(fscache_object_destroy); +static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); + /* * enqueue an object for metadata-type processing */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:811 @ void fscache_enqueue_object(struct fscac _enter("{OBJ%x}", object->debug_id); if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { - wait_queue_head_t *cong_wq = - &get_cpu_var(fscache_object_cong_wait); if (queue_work(fscache_object_wq, &object->work)) { if (fscache_object_congested()) - wake_up(cong_wq); + wake_up(&fscache_object_cong_wait); } else fscache_put_object(object, fscache_obj_put_queue); - - put_cpu_var(fscache_object_cong_wait); } } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:834 @ void fscache_enqueue_object(struct fscac */ bool fscache_object_sleep_till_congested(signed long *timeoutp) { - wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); DEFINE_WAIT(wait); if (fscache_object_congested()) return true; - add_wait_queue_exclusive(cong_wq, &wait); + add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); if (!fscache_object_congested()) *timeoutp = schedule_timeout(*timeoutp); - finish_wait(cong_wq, &wait); + finish_wait(&fscache_object_cong_wait, &wait); return fscache_object_congested(); } Index: linux-5.15/fs/fuse/readdir.c =================================================================== --- linux-5.15.orig/fs/fuse/readdir.c +++ linux-5.15/fs/fuse/readdir.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:163 @ static int fuse_direntplus_link(struct f struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* Index: linux-5.15/fs/namei.c =================================================================== --- linux-5.15.orig/fs/namei.c +++ linux-5.15/fs/namei.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1638 @ static struct dentry *__lookup_slow(cons { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3249 @ static struct dentry *lookup_open(struct struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); Index: linux-5.15/fs/namespace.c =================================================================== --- linux-5.15.orig/fs/namespace.c +++ linux-5.15/fs/namespace.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:347 @ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. 
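Stepping back to the d_alloc_parallel() conversions above (afs, cifs, fuse, namei, and shortly nfs and proc): all of these callers switch their on-stack waitqueue heads to simple waitqueues. As we read the series, the reason is the wakeup path in the fs/dcache.c hunk: __d_lookup_done() wakes the queue while holding the bit lock on the in-lookup hash chain, and an swait head's internal raw_spinlock_t may be taken there on PREEMPT_RT, while a regular wait_queue_head's spinlock_t may not. The caller-side pattern, with illustrative names:

	#include <linux/dcache.h>
	#include <linux/swait.h>

	/*
	 * Illustrative caller only: the queue head lives on the stack
	 * for the duration of the in-lookup window and is woken via
	 * swake_up_all() when the parallel lookup completes.
	 */
	static struct dentry *example_lookup(struct dentry *parent,
					     const struct qstr *name)
	{
		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);

		return d_alloc_parallel(parent, name, &wq);
	}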
 	 */
 	smp_mb();
-	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
+	might_lock(&mount_lock.lock);
+	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			cpu_relax();
+		} else {
+			/*
+			 * This prevents priority inversion, if the task
+			 * setting MNT_WRITE_HOLD got preempted on a remote
+			 * CPU, and it prevents live lock if the task setting
+			 * MNT_WRITE_HOLD has a lower priority and is bound to
+			 * the same CPU as the task that is spinning here.
+			 */
+			preempt_enable();
+			lock_mount_hash();
+			unlock_mount_hash();
+			preempt_disable();
+		}
+	}
 	/*
 	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
 	 * be set to match its requirements. So we must not load that until
Index: linux-5.15/fs/nfs/dir.c
===================================================================
--- linux-5.15.orig/fs/nfs/dir.c
+++ linux-5.15/fs/nfs/dir.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:640 @ void nfs_prime_dcache(struct dentry *par
 		unsigned long dir_verifier)
 {
 	struct qstr filename = QSTR_INIT(entry->name, entry->len);
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct dentry *dentry;
 	struct dentry *alias;
 	struct inode *inode;
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1876 @ int nfs_atomic_open(struct inode *dir, s
 		    struct file *file, unsigned open_flags,
 		    umode_t mode)
 {
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct nfs_open_context *ctx;
 	struct dentry *res;
 	struct iattr attr = { .ia_valid = ATTR_OPEN };
Index: linux-5.15/fs/nfs/unlink.c
===================================================================
--- linux-5.15.orig/fs/nfs/unlink.c
+++ linux-5.15/fs/nfs/unlink.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:16 @ #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
-#include <linux/wait.h>
+#include <linux/swait.h>
 #include <linux/namei.h>
 #include <linux/fsnotify.h>
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:187 @ nfs_async_unlink(struct dentry *dentry,
 	data->cred = get_current_cred();
 	data->res.dir_attr = &data->dir_attr;
-	init_waitqueue_head(&data->wq);
+	init_swait_queue_head(&data->wq);
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
Index: linux-5.15/fs/proc/base.c
===================================================================
--- linux-5.15.orig/fs/proc/base.c
+++ linux-5.15/fs/proc/base.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:99 @ #include <linux/posix-timers.h>
 #include <linux/time_namespace.h>
 #include <linux/resctrl.h>
+#include <linux/swait.h>
 #include <linux/cn_proc.h>
 #include <trace/events/oom.h>
 #include "internal.h"
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2075 @ bool proc_fill_cache(struct file *file,
 	child = d_hash_and_lookup(dir, &qname);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 		child = d_alloc_parallel(dir, &qname, &wq);
 		if (IS_ERR(child))
 			goto end_instantiate;
Index: linux-5.15/fs/proc/proc_sysctl.c
===================================================================
--- linux-5.15.orig/fs/proc/proc_sysctl.c
+++ linux-5.15/fs/proc/proc_sysctl.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:682 @ static bool proc_sys_fill_cache(struct f
 	child = d_lookup(dir, &qname);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 
 		child = d_alloc_parallel(dir, &qname, &wq);
 		if (IS_ERR(child))
 			return false;
Index:
linux-5.15/include/asm-generic/softirq_stack.h =================================================================== --- linux-5.15.orig/include/asm-generic/softirq_stack.h +++ linux-5.15/include/asm-generic/softirq_stack.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5 @ #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H #define __ASM_GENERIC_SOFTIRQ_STACK_H -#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK +#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT) void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) Index: linux-5.15/include/linux/console.h =================================================================== --- linux-5.15.orig/include/linux/console.h +++ linux-5.15/include/linux/console.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:19 @ #include <linux/atomic.h> #include <linux/types.h> +#include <linux/printk.h> +#include <linux/seqlock.h> + +struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +}; struct vc_data; struct console_font_op; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:146 @ static inline int con_debug_leave(void) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ +#define CON_HANDOVER (128) /* Device was previously a boot console. */ struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); + void (*write_atomic)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:161 @ struct console { short flags; short index; int cflag; +#ifdef CONFIG_PRINTK + char sync_buf[CONSOLE_LOG_MAX]; + struct latched_seq printk_seq; + struct latched_seq printk_sync_seq; +#ifdef CONFIG_HAVE_NMI + struct latched_seq printk_sync_nmi_seq; +#endif +#endif /* CONFIG_PRINTK */ + + struct task_struct *thread; uint ispeed; uint ospeed; void *data; Index: linux-5.15/include/linux/dcache.h =================================================================== --- linux-5.15.orig/include/linux/dcache.h +++ linux-5.15/include/linux/dcache.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:111 @ struct dentry { union { struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ + struct swait_queue_head *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:243 @ extern void d_set_d_op(struct dentry *de extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); + struct swait_queue_head *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); Index: linux-5.15/include/linux/entry-common.h =================================================================== --- linux-5.15.orig/include/linux/entry-common.h +++ linux-5.15/include/linux/entry-common.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:60 @ # define 
ARCH_EXIT_TO_USER_MODE_WORK (0) #endif +#ifdef CONFIG_PREEMPT_LAZY +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#else +# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) +#endif + #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** Index: linux-5.15/include/linux/irq_work.h =================================================================== --- linux-5.15.orig/include/linux/irq_work.h +++ linux-5.15/include/linux/irq_work.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6 @ #define _LINUX_IRQ_WORK_H #include <linux/smp_types.h> +#include <linux/rcuwait.h> /* * An entry can be in one of four states: @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:20 @ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:52 @ static inline bool irq_work_is_busy(stru return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); Index: linux-5.15/include/linux/irqdesc.h =================================================================== --- linux-5.15.orig/include/linux/irqdesc.h +++ linux-5.15/include/linux/irqdesc.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:163 @ static inline void generic_handle_irq_de int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); +int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* Index: linux-5.15/include/linux/irqflags.h =================================================================== --- linux-5.15.orig/include/linux/irqflags.h +++ linux-5.15/include/linux/irqflags.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:74 @ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:135 @ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); Index: linux-5.15/include/linux/kernel.h =================================================================== --- linux-5.15.orig/include/linux/kernel.h +++ linux-5.15/include/linux/kernel.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:114 @ static __always_inline 
void might_resche #endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -extern void ___might_sleep(const char *file, int line, int preempt_offset); -extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __might_resched(const char *file, int line, unsigned int offsets); +extern void __might_sleep(const char *file, int line); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:132 @ extern void __cant_migrate(const char *f * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) /** * cant_sleep - annotation for functions that cannot sleep * @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:171 @ extern void __cant_migrate(const char *f */ # define non_block_end() WARN_ON(current->non_block_count-- == 0) #else - static inline void ___might_sleep(const char *file, int line, - int preempt_offset) { } - static inline void __might_sleep(const char *file, int line, - int preempt_offset) { } + static inline void __might_resched(const char *file, int line, + unsigned int offsets) { } +static inline void __might_sleep(const char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define cant_migrate() do { } while (0) Index: linux-5.15/include/linux/kgdb.h =================================================================== --- linux-5.15.orig/include/linux/kgdb.h +++ linux-5.15/include/linux/kgdb.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:215 @ extern void kgdb_call_nmi_hook(void *ign */ extern void kgdb_roundup_cpus(void); +extern void kgdb_roundup_cpu(unsigned int cpu); + /** * kgdb_arch_set_pc - Generic call back to the program counter * @regs: Current &struct pt_regs. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:370 @ extern void kgdb_free_init_mem(void); #define dbg_late_init() static inline void kgdb_panic(const char *msg) {} static inline void kgdb_free_init_mem(void) { } +static inline void kgdb_roundup_cpu(unsigned int cpu) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ Index: linux-5.15/include/linux/mm_types.h =================================================================== --- linux-5.15.orig/include/linux/mm_types.h +++ linux-5.15/include/linux/mm_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:15 @ #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/uprobes.h> +#include <linux/rcupdate.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:578 @ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif Index: linux-5.15/include/linux/netdevice.h =================================================================== --- linux-5.15.orig/include/linux/netdevice.h +++ linux-5.15/include/linux/netdevice.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1941 @ enum netdev_ml_priv_type { * @sfp_bus: attached &struct sfp_bus structure. 
* * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2274 @ struct net_device { struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; - struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; unsigned threaded:1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2384 @ static inline void netdev_for_each_tx_qu #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - (dev)->qdisc_running_key = &qdisc_running_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ Index: linux-5.15/include/linux/nfs_xdr.h =================================================================== --- linux-5.15.orig/include/linux/nfs_xdr.h +++ linux-5.15/include/linux/nfs_xdr.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1695 @ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; + struct swait_queue_head wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; Index: linux-5.15/include/linux/preempt.h =================================================================== --- linux-5.15.orig/include/linux/preempt.h +++ linux-5.15/include/linux/preempt.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:125 @ * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) -#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET +#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else -#define PREEMPT_LOCK_OFFSET 0 +/* Locks on RT do not disable preemption */ +#define PREEMPT_LOCK_OFFSET 0 #endif /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:178 @ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:200 @ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT +# define 
preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() barrier(); +#else +# define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() +#endif #define preemptible() (preempt_count() == 0 && !irqs_disabled()) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:243 @ do { \ __preempt_schedule(); \ } while (0) +/* + * open code preempt_check_resched() because it is not exported to modules and + * used by local_unlock() or bpf_enable_instrumentation(). + */ +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + if (should_resched(0)) \ + __preempt_schedule(); \ +} while (0) + #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:262 @ do { \ preempt_count_dec(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define preempt_enable_notrace() \ do { \ barrier(); \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:306 @ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #define preemptible() 0 +#define preempt_lazy_disable() barrier() +#define preempt_lazy_enable() barrier() + #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:330 @ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:446 @ extern void migrate_enable(void); #else -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } +static inline void migrate_disable(void) +{ + preempt_lazy_disable(); +} + +static inline void migrate_enable(void) +{ + preempt_lazy_enable(); +} #endif /* CONFIG_SMP */ Index: linux-5.15/include/linux/printk.h =================================================================== --- linux-5.15.orig/include/linux/printk.h +++ linux-5.15/include/linux/printk.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:50 @ static inline const char *printk_skip_he #define CONSOLE_EXT_LOG_MAX 8192 +/* + * The maximum size of a record formatted for console printing + * (i.e. with the prefix prepended to every line). + */ +#define CONSOLE_LOG_MAX 1024 + /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:164 @ int vprintk(const char *fmt, va_list arg asmlinkage __printf(1, 2) __cold int _printk(const char *fmt, ...); +bool pr_flush(int timeout_ms, bool reset_on_progress); + /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! 
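Reviewer sketch for the preempt_lazy_*() helpers added in the preempt.h hunk above (the function name is hypothetical): a section that must not be rescheduled for fairness reasons (TIF_NEED_RESCHED_LAZY stays pending) but can still be preempted by a hard need-resched from an RT task.

#include <linux/preempt.h>

/* Illustration only; a no-op unless CONFIG_PREEMPT_LAZY is enabled. */
static void my_percpu_section(void)
{
	preempt_lazy_disable();
	/* non-sleeping per-CPU work; RT tasks may still preempt here */
	preempt_lazy_enable();
}
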
*/ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:235 @ static inline void printk_deferred_exit( { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + static inline int printk_ratelimit(void) { return 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:300 @ static inline void printk_trigger_flush( extern int __printk_cpu_trylock(void); extern void __printk_wait_on_cpu_lock(void); extern void __printk_cpu_unlock(void); +extern bool kgdb_roundup_delay(unsigned int cpu); + +#else + +#define __printk_cpu_trylock() 1 +#define __printk_wait_on_cpu_lock() +#define __printk_cpu_unlock() + +static inline bool kgdb_roundup_delay(unsigned int cpu) +{ + return false; +} +#endif /* CONFIG_SMP */ /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * raw_printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning + * lock and disable interrupts. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). + * to be passed to raw_printk_cpu_unlock_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. */ -#define printk_cpu_lock_irqsave(flags) \ +#define raw_printk_cpu_lock_irqsave(flags) \ for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_trylock()) \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:333 @ extern void __printk_cpu_unlock(void); } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). + * raw_printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant + * spinning lock and restore interrupts. + * @flags: Caller's saved interrupt state from raw_printk_cpu_lock_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ +#define raw_printk_cpu_unlock_irqrestore(flags) \ do { \ __printk_cpu_unlock(); \ local_irq_restore(flags); \ - } while (0) \ + } while (0) -#else - -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) +/* + * Used to synchronize atomic consoles. + * + * The same as raw_printk_cpu_lock_irqsave() except that hardware interrupts + * are _not_ restored while spinning. 
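Usage sketch for the renamed cpu-reentrant printk lock above. The dump function is hypothetical; the pattern follows the macros' documented contract and matches how callers such as stack dumpers are expected to use them:

#include <linux/printk.h>

static void my_emergency_dump(void)	/* hypothetical */
{
	unsigned long flags;

	raw_printk_cpu_lock_irqsave(flags);
	/* interleaving-free output; the lock is reentrant on the owning CPU */
	pr_emerg("dump begin\n");
	raw_printk_cpu_unlock_irqrestore(flags);
}
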
+ */ +#define console_atomic_lock(flags) \ + do { \ + local_irq_save(flags); \ + while (!__printk_cpu_trylock()) \ + cpu_relax(); \ + } while (0) -#endif /* CONFIG_SMP */ +#define console_atomic_unlock raw_printk_cpu_unlock_irqrestore extern int kptr_restrict; Index: linux-5.15/include/linux/ratelimit_types.h =================================================================== --- linux-5.15.orig/include/linux/ratelimit_types.h +++ linux-5.15/include/linux/ratelimit_types.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:7 @ #include <linux/bits.h> #include <linux/param.h> -#include <linux/spinlock_types.h> +#include <linux/spinlock_types_raw.h> #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 Index: linux-5.15/include/linux/rcupdate.h =================================================================== --- linux-5.15.orig/include/linux/rcupdate.h +++ linux-5.15/include/linux/rcupdate.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:97 @ void rcu_init_tasks_generic(void); static inline void rcu_init_tasks_generic(void) { } #endif +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TASKS_RCU_GENERIC) +void rcu_tasks_initiate_self_tests(void); +#else +static inline void rcu_tasks_initiate_self_tests(void) {} +#endif + + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); Index: linux-5.15/include/linux/rtmutex.h =================================================================== --- linux-5.15.orig/include/linux/rtmutex.h +++ linux-5.15/include/linux/rtmutex.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:102 @ extern void __rt_mutex_init(struct rt_mu #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#define rt_mutex_lock_nest_lock(lock, nest_lock) \ + do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + } while (0) + #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); Index: linux-5.15/include/linux/sched.h =================================================================== --- linux-5.15.orig/include/linux/sched.h +++ linux-5.15/include/linux/sched.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:121 @ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - #define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) - /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1083 @ struct task_struct { /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT + /* TODO: move me into ->restart_block ? 
*/ + struct kernel_siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1741 @ static __always_inline bool is_percpu_th #endif } +/* Is the current task guaranteed to stay on its current CPU? */ +static inline bool is_migratable(void) +{ +#ifdef CONFIG_SMP + return preemptible() && !current->migration_disabled; +#else + return false; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2026 @ static inline int test_tsk_need_resched( return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + +#ifdef CONFIG_PREEMPT_RT +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return p->saved_state == match_state; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + bool traced = false; + + /* in case the task is sleeping on tasklist_lock */ + raw_spin_lock_irq(&task->pi_lock); + if (READ_ONCE(task->__state) & __TASK_TRACED) + traced = true; + else if (task->saved_state & __TASK_TRACED) + traced = true; + raw_spin_unlock_irq(&task->pi_lock); + return traced; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + bool traced_stopped = false; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) + traced_stopped = true; + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return traced_stopped; +} + +#else + +static inline bool task_match_saved_state(struct task_struct *p, long match_state) +{ + return false; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & __TASK_TRACED; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED); +} +#endif + +static inline bool task_match_state_or_saved(struct task_struct *p, + long match_state) +{ + if (READ_ONCE(p->__state) == match_state) + return true; + + return task_match_saved_state(p, match_state); +} + +static inline bool task_match_state_lock(struct task_struct *p, + long match_state) +{ + bool match; + + raw_spin_lock_irq(&p->pi_lock); + match = task_match_state_or_saved(p, match_state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * cond_resched() and cond_resched_lock(): latency reduction 
via * explicit rescheduling in places that are safe. The return @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2172 @ static inline int _cond_resched(void) { #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ - ___might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2180 @ extern int __cond_resched_lock(spinlock_ extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); -#define cond_resched_lock(lock) ({ \ - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ - __cond_resched_lock(lock); \ +#define MIGHT_RESCHED_RCU_SHIFT 8 +#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) + +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET +#else +/* + * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) +#endif + +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ }) -#define cond_resched_rwlock_read(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_read(lock); \ +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ }) -#define cond_resched_rwlock_write(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_write(lock); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) Index: linux-5.15/include/linux/sched/mm.h =================================================================== --- linux-5.15.orig/include/linux/sched/mm.h +++ linux-5.15/include/linux/sched/mm.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:52 @ static inline void mmdrop(struct mm_stru __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +extern void __mmdrop_delayed(struct rcu_head *rhp); + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. 
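Worked decoding of the combined "offsets" word introduced above, as __might_resched() is expected to interpret it: the low eight bits carry the expected preempt count, the remaining bits the expected rcu_preempt_depth(). Illustration only, not code from the patch:

#include <linux/printk.h>

static void decode_offsets(unsigned int offsets)	/* illustration */
{
	unsigned int preempt_expected = offsets & MIGHT_RESCHED_PREEMPT_MASK;
	unsigned int rcu_expected = offsets >> MIGHT_RESCHED_RCU_SHIFT;

	/* With PREEMPT_LOCK_RESCHED_OFFSETS on RT this yields
	 * preempt_expected == 0 (locks do not disable preemption there)
	 * and rcu_expected == 1, matching the implicit rcu_read_lock()
	 * in spin_lock(). */
	pr_debug("expected: preempt=%u rcu=%u\n",
		 preempt_expected, rcu_expected);
}
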
Index: linux-5.15/include/linux/serial_8250.h =================================================================== --- linux-5.15.orig/include/linux/serial_8250.h +++ linux-5.15/include/linux/serial_8250.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:10 @ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H +#include <linux/atomic.h> #include <linux/serial_core.h> #include <linux/serial_reg.h> #include <linux/platform_device.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:130 @ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + atomic_t console_printing; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:187 @ void serial8250_init_port(struct uart_82 void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); +void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); Index: linux-5.15/include/linux/skbuff.h =================================================================== --- linux-5.15.orig/include/linux/skbuff.h +++ linux-5.15/include/linux/skbuff.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:303 @ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1996 @ static inline void skb_queue_head_init(s __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { Index: linux-5.15/include/linux/smp.h =================================================================== --- linux-5.15.orig/include/linux/smp.h +++ linux-5.15/include/linux/smp.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:271 @ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() +#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) +#define put_cpu_light() migrate_enable() + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: Index: linux-5.15/include/linux/spinlock_types_up.h =================================================================== --- linux-5.15.orig/include/linux/spinlock_types_up.h +++ linux-5.15/include/linux/spinlock_types_up.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4 @ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H # error "please don't include this file directly" #endif Index: linux-5.15/include/linux/thread_info.h =================================================================== --- linux-5.15.orig/include/linux/thread_info.h +++ linux-5.15/include/linux/thread_info.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:166 @ static inline int test_ti_thread_flag(st clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() 
(test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) +#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_lazy() 0 +#endif #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, Index: linux-5.15/include/linux/trace_events.h =================================================================== --- linux-5.15.orig/include/linux/trace_events.h +++ linux-5.15/include/linux/trace_events.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:72 @ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned char preempt_lazy_count; }; #define TRACE_EVENT_TYPE_MAX \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:162 @ static inline void tracing_generic_entry unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; + entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; entry->pid = current->pid; entry->type = type; - entry->flags = trace_ctx >> 16; + entry->flags = trace_ctx >> 24; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:178 @ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT Index: linux-5.15/include/linux/u64_stats_sync.h =================================================================== --- linux-5.15.orig/include/linux/u64_stats_sync.h +++ linux-5.15/include/linux/u64_stats_sync.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:69 @ #include <linux/seqlock.h> struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:86 @ static inline u64 u64_stats_read(const u return local64_read(&p->v); } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + local64_set(&p->v, val); +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { local64_add(val, &p->v); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:112 @ static inline u64 u64_stats_read(const u return p->v; } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + p->v = val; +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { p->v += val; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:128 @ static inline void u64_stats_inc(u64_sta } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:138 @ static inline void u64_stats_init(struct static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { 
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:159 @ u64_stats_update_begin_irqsave(struct u6 { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:173 @ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:193 @ static inline unsigned int __u64_stats_f static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:202 @ static inline unsigned int u64_stats_fet static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:212 @ static inline bool __u64_stats_fetch_ret static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:226 @ static inline bool u64_stats_fetch_retry */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:237 @ static inline unsigned int u64_stats_fet static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); Index: linux-5.15/include/net/act_api.h 
=================================================================== --- linux-5.15.orig/include/net/act_api.h +++ linux-5.15/include/net/act_api.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:33 @ struct tc_action { atomic_t tcfa_bindcnt; int tcfa_action; struct tcf_t tcfa_tm; - struct gnet_stats_basic_packed tcfa_bstats; - struct gnet_stats_basic_packed tcfa_bstats_hw; + struct gnet_stats_basic_sync tcfa_bstats; + struct gnet_stats_basic_sync tcfa_bstats_hw; struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; + struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *act_cookie; struct tcf_chain __rcu *goto_chain; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:209 @ static inline void tcf_action_update_bst struct sk_buff *skb) { if (likely(a->cpu_bstats)) { - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + bstats_update(this_cpu_ptr(a->cpu_bstats), skb); return; } spin_lock(&a->tcfa_lock); Index: linux-5.15/include/net/gen_stats.h =================================================================== --- linux-5.15.orig/include/net/gen_stats.h +++ linux-5.15/include/net/gen_stats.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:10 @ #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> -/* Note: this used to be in include/uapi/linux/gen_stats.h */ -struct gnet_stats_basic_packed { - __u64 bytes; - __u64 packets; -}; - -struct gnet_stats_basic_cpu { - struct gnet_stats_basic_packed bstats; +/* Throughput stats. + * Must be initialized beforehand with gnet_stats_basic_sync_init(). + * + * If no reads can ever occur parallel to writes (e.g. stack-allocated + * bstats), then the internal stat values can be written to and read + * from directly. Otherwise, use _bstats_set/update() for writes and + * gnet_stats_add_basic() for reads. 
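The comment above spells out the write/read discipline for the new gnet_stats_basic_sync type (defined just below); reduced to a sketch with hypothetical function names, assuming the structure went through gnet_stats_basic_sync_init() first:

#include <net/gen_stats.h>
#include <linux/u64_stats_sync.h>

static void my_record(struct gnet_stats_basic_sync *b, unsigned int len)
{
	u64_stats_update_begin(&b->syncp);
	u64_stats_add(&b->bytes, len);
	u64_stats_inc(&b->packets);
	u64_stats_update_end(&b->syncp);
}

static u64 my_read_bytes(const struct gnet_stats_basic_sync *b)
{
	unsigned int start;
	u64 bytes;

	/* retry if a 32-bit writer was mid-update */
	do {
		start = u64_stats_fetch_begin(&b->syncp);
		bytes = u64_stats_read(&b->bytes);
	} while (u64_stats_fetch_retry(&b->syncp, start));
	return bytes;
}
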
+ */ +struct gnet_stats_basic_sync { + u64_stats_t bytes; + u64_stats_t packets; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:40 @ struct gnet_dump { struct tc_stats tc_stats; }; +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:49 @ int gnet_stats_start_copy_compat(struct spinlock_t *lock, struct gnet_dump *d, int padattr); -int gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -void __gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -int gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); +int gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +int gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **ptr); int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu_q, - const struct gnet_stats_queue *q, __u32 qlen); +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu_q, + const struct gnet_stats_queue *q); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, struct gnet_stats_rate_est64 *sample); Index: linux-5.15/include/net/netfilter/xt_rateest.h =================================================================== --- linux-5.15.orig/include/net/netfilter/xt_rateest.h +++ linux-5.15/include/net/netfilter/xt_rateest.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9 @ struct xt_rateest { /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ - 
struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; spinlock_t lock; Index: linux-5.15/include/net/pkt_cls.h =================================================================== --- linux-5.15.orig/include/net/pkt_cls.h +++ linux-5.15/include/net/pkt_cls.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:768 @ struct tc_cookie { }; struct tc_qopt_offload_stats { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:888 @ struct tc_gred_qopt_offload_params { }; struct tc_gred_qopt_offload_stats { - struct gnet_stats_basic_packed bstats[MAX_DPs]; + struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; Index: linux-5.15/include/net/sch_generic.h =================================================================== --- linux-5.15.orig/include/net/sch_generic.h +++ linux-5.15/include/net/sch_generic.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:43 @ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; +enum qdisc_state2_t { + /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. + * Use qdisc_run_begin/end() or qdisc_is_running() instead. + */ + __QDISC_STATE2_RUNNING, +}; + #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:107 @ struct Qdisc { struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:117 @ struct Qdisc { */ struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; - struct gnet_stats_basic_packed bstats; - seqcount_t running; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; unsigned long state; + unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:153 @ static inline struct Qdisc *qdisc_refcou return NULL; } +/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc + * root_lock section, or provide their own memory barriers -- ordering + * against qdisc_run_begin/end() atomic bit operations. + */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; + return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:181 @ static inline bool qdisc_is_empty(const return !READ_ONCE(qdisc->q.qlen); } +/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with + * the qdisc root lock acquired. + */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:203 @ static inline bool qdisc_run_begin(struc * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); - } else if (qdisc_is_running(qdisc)) { - return false; } - /* Variant of write_seqcount_begin() telling lockdep a trylock - * was attempted. 
- */ - raw_write_seqcount_begin(&qdisc->running); - seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); - return true; + return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline void qdisc_run_end(struct Qdisc *qdisc) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:222 @ static inline void qdisc_run_end(struct &qdisc->state))) __netif_schedule(qdisc); } else { - write_seqcount_end(&qdisc->running); + __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:586 @ static inline spinlock_t *qdisc_root_sle return qdisc_lock(root); } -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) -{ - struct Qdisc *root = qdisc_root_sleeping(qdisc); - - ASSERT_RTNL(); - return &root->running; -} - static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:835 @ static inline int qdisc_enqueue(struct s return sch->enqueue(skb, sch, to_free); } -static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, +static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u32 packets) { - bstats->bytes += bytes; - bstats->packets += packets; + u64_stats_update_begin(&bstats->syncp); + u64_stats_add(&bstats->bytes, bytes); + u64_stats_add(&bstats->packets, packets); + u64_stats_update_end(&bstats->syncp); } -static inline void bstats_update(struct gnet_stats_basic_packed *bstats, +static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:852 @ static inline void bstats_update(struct skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1); } -static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - __u64 bytes, __u32 packets) -{ - u64_stats_update_begin(&bstats->syncp); - _bstats_update(&bstats->bstats, bytes, packets); - u64_stats_update_end(&bstats->syncp); -} - -static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - const struct sk_buff *skb) -{ - u64_stats_update_begin(&bstats->syncp); - bstats_update(&bstats->bstats, skb); - u64_stats_update_end(&bstats->syncp); -} - static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); + bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:944 @ static inline void qdisc_qstats_qlen_bac __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; - __u32 len = qdisc_qlen_sum(sch); - __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); - *qlen = qstats.qlen; + gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); + *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1291 @ void psched_ppscfg_precompute(struct psc struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct rcu_head rcu; }; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1299 @ struct mini_Qdisc { static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); + bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) Index: linux-5.15/init/Kconfig =================================================================== --- linux-5.15.orig/init/Kconfig +++ linux-5.15/init/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:922 @ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1917 @ choice config SLAB bool "SLAB" + depends on !PREEMPT_RT select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1938 @ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT help SLOB replaces the stock allocator with a drastically simpler allocator. 
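Back in the sch_generic.h hunk above, qdisc_qstats_qlen_backlog() now adds qdisc_qlen() itself because gnet_stats_add_queue() no longer takes a qlen argument. A hedged sketch of folding counters with the replacement helpers (the qdisc handling is hypothetical):

#include <net/sch_generic.h>

static u64 my_total_packets(struct Qdisc *sch)	/* hypothetical */
{
	struct gnet_stats_basic_sync total;

	gnet_stats_basic_sync_init(&total);
	/* "running" selects the retry-read protocol for the
	 * non-per-CPU counters */
	gnet_stats_add_basic(&total, sch->cpu_bstats, &sch->bstats, true);
	return u64_stats_read(&total.packets);
}
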
SLOB is generally more space efficient but Index: linux-5.15/init/main.c =================================================================== --- linux-5.15.orig/init/main.c +++ linux-5.15/init/main.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1609 @ static noinline void __init kernel_init_ rcu_init_tasks_generic(); do_pre_smp_initcalls(); + rcu_tasks_initiate_self_tests(); lockup_detector_init(); smp_init(); Index: linux-5.15/kernel/Kconfig.preempt =================================================================== --- linux-5.15.orig/kernel/Kconfig.preempt +++ linux-5.15/kernel/Kconfig.preempt @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4 @ # SPDX-License-Identifier: GPL-2.0-only +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + choice prompt "Preemption Model" default PREEMPT_NONE Index: linux-5.15/kernel/cgroup/rstat.c =================================================================== --- linux-5.15.orig/kernel/cgroup/rstat.c +++ linux-5.15/kernel/cgroup/rstat.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:159 @ static void cgroup_rstat_flush_locked(st raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:173 @ static void cgroup_rstat_flush_locked(st css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || Index: linux-5.15/kernel/debug/debug_core.c =================================================================== --- linux-5.15.orig/kernel/debug/debug_core.c +++ linux-5.15/kernel/debug/debug_core.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:242 @ NOKPROBE_SYMBOL(kgdb_call_nmi_hook); static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = CSD_INIT(kgdb_call_nmi_hook, NULL); -void __weak kgdb_roundup_cpus(void) +void __weak kgdb_roundup_cpu(unsigned int cpu) { call_single_data_t *csd; + int ret; + + csd = &per_cpu(kgdb_roundup_csd, cpu); + + /* + * If it didn't round up last time, don't try again + * since smp_call_function_single_async() will block. + * + * If rounding_up is false then we know that the + * previous call must have at least started and that + * means smp_call_function_single_async() won't block. + */ + if (kgdb_info[cpu].rounding_up) + return; + kgdb_info[cpu].rounding_up = true; + + ret = smp_call_function_single_async(cpu, csd); + if (ret) + kgdb_info[cpu].rounding_up = false; +} +NOKPROBE_SYMBOL(kgdb_roundup_cpu); + +void __weak kgdb_roundup_cpus(void) +{ int this_cpu = raw_smp_processor_id(); int cpu; - int ret; for_each_online_cpu(cpu) { /* No need to roundup ourselves */ if (cpu == this_cpu) continue; - csd = &per_cpu(kgdb_roundup_csd, cpu); - - /* - * If it didn't round up last time, don't try again - * since smp_call_function_single_async() will block. - * - * If rounding_up is false then we know that the - * previous call must have at least started and that - * means smp_call_function_single_async() won't block. 
- */ - if (kgdb_info[cpu].rounding_up) - continue; - kgdb_info[cpu].rounding_up = true; - - ret = smp_call_function_single_async(cpu, csd); - if (ret) - kgdb_info[cpu].rounding_up = false; + kgdb_roundup_cpu(cpu); } } NOKPROBE_SYMBOL(kgdb_roundup_cpus); Index: linux-5.15/kernel/debug/kdb/kdb_io.c =================================================================== --- linux-5.15.orig/kernel/debug/kdb/kdb_io.c +++ linux-5.15/kernel/debug/kdb/kdb_io.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:562 @ static void kdb_msg_write(const char *ms cp++; } + /* mirror output on atomic consoles */ for_each_console(c) { if (!(c->flags & CON_ENABLED)) continue; if (c == dbg_io_ops->cons) continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. - */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (!c->write_atomic) + continue; + c->write_atomic(c, msg, msg_len); + touch_nmi_watchdog(); } } Index: linux-5.15/kernel/entry/common.c =================================================================== --- linux-5.15.orig/kernel/entry/common.c +++ linux-5.15/kernel/entry/common.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:162 @ static unsigned long exit_to_user_mode_l local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & _TIF_NEED_RESCHED_MASK) schedule(); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:398 @ void irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (should_resched(0)) preempt_schedule_irq(); } } Index: linux-5.15/kernel/exit.c =================================================================== --- linux-5.15.orig/kernel/exit.c +++ linux-5.15/kernel/exit.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:67 @ #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/kprobes.h> #include <linux/sysfs.h> #include <linux/uaccess.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:219 @ static void delayed_put_task_struct(stru { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); + + /* RT enabled kernels delay freeing the VMAP'ed task stack */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + put_task_stack(tsk); + put_task_struct(tsk); } Index: linux-5.15/kernel/fork.c =================================================================== --- linux-5.15.orig/kernel/fork.c +++ linux-5.15/kernel/fork.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:297 @ static inline void free_thread_stack(str return; } - vfree_atomic(tsk->stack); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + vfree_atomic(tsk->stack); + else + vfree(tsk->stack); return; } #endif @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:719 @ 
void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; Index: linux-5.15/kernel/irq/irqdesc.c =================================================================== --- linux-5.15.orig/kernel/irq/irqdesc.c +++ linux-5.15/kernel/irq/irqdesc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:667 @ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only. + */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging Index: linux-5.15/kernel/irq/manage.c =================================================================== --- linux-5.15.orig/kernel/irq/manage.c +++ linux-5.15/kernel/irq/manage.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1304 @ static int irq_thread(void *data) irq_thread_set_ready(desc, action); + sched_set_fifo(current); + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1471 @ setup_irq_thread(struct irqaction *new, if (IS_ERR(t)) return PTR_ERR(t); - sched_set_fifo(t); - /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2864 @ EXPORT_SYMBOL_GPL(irq_get_irqchip_state) * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * - * This function should be called with preemption disabled if the + * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. 
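Usage sketch for generic_handle_irq_safe() added in the irqdesc.c hunk above, assuming the matching declaration is visible; the wiring of the child interrupt number through the cookie is hypothetical:

#include <linux/interrupt.h>
#include <linux/irq.h>

static irqreturn_t my_demux_thread_fn(int irq, void *data)
{
	unsigned int child_irq = *(unsigned int *)data;	/* hypothetical */

	/* safe from task context; interrupts are disabled internally */
	if (generic_handle_irq_safe(child_irq))
		return IRQ_NONE;
	return IRQ_HANDLED;
}
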
*/ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, Index: linux-5.15/kernel/irq/spurious.c =================================================================== --- linux-5.15.orig/kernel/irq/spurious.c +++ linux-5.15/kernel/irq/spurious.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:450 @ MODULE_PARM_DESC(noirqdebug, "Disable ir static int __init irqfixup_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:466 @ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); + return 1; + } irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); Index: linux-5.15/kernel/irq_work.c =================================================================== --- linux-5.15.orig/kernel/irq_work.c +++ linux-5.15/kernel/irq_work.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:21 @ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> +#include <linux/smpboot.h> #include <asm/processor.h> #include <linux/kasan.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); +static DEFINE_PER_CPU(struct task_struct *, irq_workd); + +static void wake_irq_workd(void) +{ + struct task_struct *tsk = __this_cpu_read(irq_workd); + + if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) + wake_up_process(tsk); +} + +#ifdef CONFIG_SMP +static void irq_work_wake(struct irq_work *entry) +{ + wake_irq_workd(); +} + +static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = + IRQ_WORK_INIT_HARD(irq_work_wake); +#endif + +static int irq_workd_should_run(unsigned int cpu) +{ + return !llist_empty(this_cpu_ptr(&lazy_list)); +} /* * Claim the entry so that no one else will poke at it. 
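A sketch of how irq_work users pick a context under the split introduced here (callback and names hypothetical): plain items are moved to the lazy list and run from the per-CPU irq_work/N thread on PREEMPT_RT, while IRQ_WORK_HARD_IRQ items keep running from hard interrupt context on all kernels.

#include <linux/irq_work.h>

static void my_cb(struct irq_work *w)
{
	/* short, non-sleeping callback */
}

static struct irq_work my_thread_work = IRQ_WORK_INIT(my_cb);
static struct irq_work my_hard_work = IRQ_WORK_INIT_HARD(my_cb);

/* irq_work_queue(&my_thread_work);  -> irq_work/N thread on RT      */
/* irq_work_queue(&my_hard_work);    -> hard irq context, even on RT */
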
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:80 @ void __weak arch_irq_work_raise(void) /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { + struct llist_head *list; + bool rt_lazy_work = false; + bool lazy_work = false; + int work_flags; + + work_flags = atomic_read(&work->node.a_flags); + if (work_flags & IRQ_WORK_LAZY) + lazy_work = true; + else if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(work_flags & IRQ_WORK_HARD_IRQ)) + rt_lazy_work = true; + + if (lazy_work || rt_lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + + if (!llist_add(&work->node.llist, list)) + return; + /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { - if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); - } + if (!lazy_work || tick_nohz_tick_stopped()) + arch_irq_work_raise(); } /* Enqueue the irq work @work on the current CPU */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:146 @ bool irq_work_queue_on(struct irq_work * if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); + + /* + * On PREEMPT_RT the items which are not marked as + * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work + * item is used on the remote CPU to wake the thread. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { + + if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) + goto out; + + work = &per_cpu(irq_work_wakeup, cpu); + if (!irq_work_claim(work)) + goto out; + } + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } +out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } - bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:219 @ void irq_work_single(void *arg) * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:230 @ static void irq_work_run_list(struct lli struct irq_work *work, *tmp; struct llist_node *llnode; - BUG_ON(!irqs_disabled()); + /* + * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed + * in a per-CPU thread in preemptible context. Only the items which are + * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. 
+ */ + BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:252 @ static void irq_work_run_list(struct lli void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:265 @ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:279 @ void irq_work_tick(void) void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); + might_sleep(); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; + } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +static void run_irq_workd(unsigned int cpu) +{ + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + +static void irq_workd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static struct smp_hotplug_thread irqwork_threads = { + .store = &irq_workd, + .setup = irq_workd_setup, + .thread_should_run = irq_workd_should_run, + .thread_fn = run_irq_workd, + .thread_comm = "irq_work/%u", +}; + +static __init int irq_work_init_threads(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); + return 0; +} +early_initcall(irq_work_init_threads); Index: linux-5.15/kernel/kcov.c =================================================================== --- linux-5.15.orig/kernel/kcov.c +++ linux-5.15/kernel/kcov.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:91 @ static struct list_head kcov_remote_area struct kcov_percpu_data { void *irq_area; + local_lock_t lock; unsigned int saved_mode; unsigned int saved_size; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:100 @ struct kcov_percpu_data { int saved_sequence; }; -static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:830 @ void kcov_remote_start(u64 handle) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); /* * Check that kcov_remote_start() is not called twice in background @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:838 @ void kcov_remote_start(u64 handle) */ mode = READ_ONCE(t->kcov_mode); if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:847 @ void kcov_remote_start(u64 handle) * happened while collecting coverage from a background thread. 
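The kcov conversion above follows the standard local_lock pattern; reduced to a sketch with hypothetical per-CPU data. On !RT the lock compiles down to local_irq_save(), on PREEMPT_RT it becomes a per-CPU sleeping lock, which is why the plain local_irq_save()/restore() pairs had to go:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct my_pcpu_data {
	local_lock_t lock;
	unsigned long hits;
};

static DEFINE_PER_CPU(struct my_pcpu_data, my_pcpu_data) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void my_account(void)
{
	unsigned long flags;

	local_lock_irqsave(&my_pcpu_data.lock, flags);
	__this_cpu_inc(my_pcpu_data.hits);
	local_unlock_irqrestore(&my_pcpu_data.lock, flags);
}
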
*/ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } kcov_debug("handle = %llx, context: %s\n", handle, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:876 @ void kcov_remote_start(u64 handle) size = CONFIG_KCOV_IRQ_AREA_SIZE; area = this_cpu_ptr(&kcov_percpu_data)->irq_area; } - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); /* Can only happen when in_task(). */ if (!area) { + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); area = vmalloc(size * sizeof(unsigned long)); if (!area) { kcov_put(kcov); return; } + local_lock_irqsave(&kcov_percpu_data.lock, flags); } - local_irq_save(flags); - /* Reset coverage size. */ *(u64 *)area = 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:898 @ void kcov_remote_start(u64 handle) } kcov_start(t, kcov, size, area, mode, sequence); - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); } EXPORT_SYMBOL(kcov_remote_start); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:972 @ void kcov_remote_stop(void) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); mode = READ_ONCE(t->kcov_mode); barrier(); if (!kcov_mode_enabled(mode)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:985 @ void kcov_remote_stop(void) * actually found the remote handle and started collecting coverage. */ if (in_serving_softirq() && !t->kcov_softirq) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* Make sure that kcov_softirq is only set when in softirq. */ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1020 @ void kcov_remote_stop(void) spin_unlock(&kcov_remote_lock); } - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); /* Get in kcov_remote_start(). */ kcov_put(kcov); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1041 @ static int __init kcov_init(void) int cpu; for_each_possible_cpu(cpu) { - void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * - sizeof(unsigned long)); + void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long), cpu_to_node(cpu)); if (!area) return -ENOMEM; per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; Index: linux-5.15/kernel/kprobes.c =================================================================== --- linux-5.15.orig/kernel/kprobes.c +++ linux-5.15/kernel/kprobes.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1251 @ void kprobe_busy_end(void) } /* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated - * with this task. These left over instances represent probed functions - * that have been called but will never return. 
+ * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any function-return probe instances + * associated with this task. These left over instances represent probed + * functions that have been called but will never return. */ void kprobe_flush_task(struct task_struct *tk) { Index: linux-5.15/kernel/ksysfs.c =================================================================== --- linux-5.15.orig/kernel/ksysfs.c +++ linux-5.15/kernel/ksysfs.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:141 @ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:241 @ static struct attribute * kernel_attrs[] &rcu_expedited_attr.attr, &rcu_normal_attr.attr, #endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, +#endif NULL }; Index: linux-5.15/kernel/kthread.c =================================================================== --- linux-5.15.orig/kernel/kthread.c +++ linux-5.15/kernel/kthread.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:273 @ EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { + static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:304 @ static int kthread(void *_create) init_completion(&self->parked); current->vfork_done = &self->exited; + /* + * The new thread inherited kthreadd's priority and CPU mask. Reset + * back to default in case they have been changed. + */ + sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:420 @ struct task_struct *__kthread_create_on_ ready: task = create->result; if (!IS_ERR(task)) { - static const struct sched_param param = { .sched_priority = 0 }; char name[TASK_COMM_LEN]; /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:428 @ ready: */ vsnprintf(name, sizeof(name), namefmt, args); set_task_comm(task, name); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. - */ - sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, - housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; Index: linux-5.15/kernel/locking/lockdep.c =================================================================== --- linux-5.15.orig/kernel/locking/lockdep.c +++ linux-5.15/kernel/locking/lockdep.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5475 @ static noinstr void check_flags(unsigned } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. 
* hardirq contexts (such as on 4KSTACKS), so only
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5490 @ static noinstr void check_flags(unsigned
 			DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
 		}
 	}
+#endif
 
 	if (!debug_locks)
 		print_irqtrace_events(current);
Index: linux-5.15/kernel/locking/rtmutex.c
===================================================================
--- linux-5.15.orig/kernel/locking/rtmutex.c
+++ linux-5.15/kernel/locking/rtmutex.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1101 @ static int __sched task_blocks_on_rt_mut
 	 * which is wrong, as the other waiter is not in a deadlock
 	 * situation.
 	 */
-	if (owner == task)
+	if (owner == task) {
+#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS)
+		/*
+		 * In a few cases the lockdep selftest for ww-mutex expects
+		 * ww_ctx->contending_lock to be assigned via
+		 * __ww_mutex_check_kill(), which does not happen if the
+		 * rtmutex detects the deadlock early.
+		 */
+		if (build_ww_mutex() && ww_ctx) {
+			struct rt_mutex *rtm;
+
+			/* Check whether the waiter should back out immediately */
+			rtm = container_of(lock, struct rt_mutex, rtmutex);
+
+			__ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+			__ww_mutex_check_kill(rtm, waiter, ww_ctx);
+		}
+#endif
 		return -EDEADLK;
+	}
 
 	raw_spin_lock(&task->pi_lock);
 	waiter->task = task;
Index: linux-5.15/kernel/locking/rtmutex_api.c
===================================================================
--- linux-5.15.orig/kernel/locking/rtmutex_api.c
+++ linux-5.15/kernel/locking/rtmutex_api.c
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:24 @ int max_lock_depth = 1024;
  */
 static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
 						  unsigned int state,
+						  struct lockdep_map *nest_lock,
 						  unsigned int subclass)
 {
 	int ret;
 
 	might_sleep();
-	mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
 	ret = __rt_mutex_lock(&lock->rtmutex, state);
 	if (ret)
 		mutex_release(&lock->dep_map, _RET_IP_);
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:52 @ EXPORT_SYMBOL(rt_mutex_base_init);
  */
 void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
 {
-	__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
+	__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
 
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+	__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
 #else /* !CONFIG_DEBUG_LOCK_ALLOC */
 
 /**
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:71 @ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
  */
 void __sched rt_mutex_lock(struct rt_mutex *lock)
 {
-	__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+	__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock);
 #endif
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:87 @ EXPORT_SYMBOL_GPL(rt_mutex_lock);
  */
 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 {
-	return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
+	return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
 /**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct 
rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_trylock - try to lock a rt_mutex * * @lock: the rt_mutex to be locked Index: linux-5.15/kernel/locking/spinlock_rt.c =================================================================== --- linux-5.15.orig/kernel/locking/spinlock_rt.c +++ linux-5.15/kernel/locking/spinlock_rt.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:27 @ #define RT_MUTEX_BUILD_SPINLOCKS #include "rtmutex.c" +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. + */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:46 @ static __always_inline void rtlock_lock( static __always_inline void __rt_spin_lock(spinlock_t *lock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rtlock_lock(&lock->lock); rcu_read_lock(); migrate_disable(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:224 @ EXPORT_SYMBOL(rt_write_trylock); void __sched rt_read_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:234 @ EXPORT_SYMBOL(rt_read_lock); void __sched rt_write_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:260 @ void __sched rt_write_unlock(rwlock_t *r } EXPORT_SYMBOL(rt_write_unlock); -int __sched rt_rwlock_is_contended(rwlock_t *rwlock) -{ - return rw_base_is_contended(&rwlock->rwbase); -} -EXPORT_SYMBOL(rt_rwlock_is_contended); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) Index: linux-5.15/kernel/panic.c =================================================================== --- linux-5.15.orig/kernel/panic.c +++ linux-5.15/kernel/panic.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:248 @ void check_panic_on_warn(const char *ori void panic(const char *fmt, ...) { static char buf[1024]; + va_list args2; va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + console_verbose(); + pr_emerg("Kernel panic - not syncing:\n"); + va_start(args2, fmt); + va_copy(args, args2); + vprintk(fmt, args2); + va_end(args2); +#ifdef CONFIG_DEBUG_BUGVERBOSE + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + dump_stack(); +#endif + pr_flush(1000, true); if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:309 @ void panic(const char *fmt, ...) 
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); - console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; - pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE - /* - * Avoid nested stack-dumping if a panic occurs during oops processing - */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) - dump_stack(); -#endif - /* * If kgdb is enabled, give it a chance to run before we stop all * the other CPUs or else we won't be able to debug processes left @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:624 @ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:639 @ static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_flush(1000, true); } /* Index: linux-5.15/kernel/printk/printk.c =================================================================== --- linux-5.15.orig/kernel/printk/printk.c +++ linux-5.15/kernel/printk/printk.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:47 @ #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> +#include <linux/kgdb.h> +#include <linux/kthread.h> +#include <linux/clocksource.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:275 @ static void __up_console_sem(unsigned lo static int console_locked, console_suspended; /* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - -/* * Array of consoles built from command line options (console=) */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:353 @ static int console_msg_format = MSG_FORM * non-prinatable characters are escaped in the "\xff" notation. */ +#ifdef CONFIG_PRINTK /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); -#ifdef CONFIG_PRINTK +/* Set to enable sync mode. Once set, it is never cleared. */ +static bool sync_mode; + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:367 @ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - -struct latched_seq { - seqcount_latch_t latch; - u64 val[2]; -}; - /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:384 @ static struct latched_seq clear_seq = { #define PREFIX_MAX 32 #endif -/* the maximum size of a formatted record (i.e. 
with prefix added per line) */ -#define CONSOLE_LOG_MAX 1024 - /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:427 @ bool printk_percpu_data_ready(void) return __printk_percpu_data_ready; } -/* Must be called under syslog_lock. */ +/* Must be called under associated write-protection lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { raw_write_seqcount_latch(&ls->latch); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1761 @ SYSCALL_DEFINE3(syslog, int, type, char return do_syslog(type, buf, len, SYSLOG_FROM_READER); } -/* - * Special console_lock variants that help to reduce the risk of soft-lockups. - * They allow to pass console_lock to another printk() call using a busy wait. - */ +int printk_delay_msec __read_mostly; -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_owner_dep_map = { - .name = "console_owner" -}; -#endif +static inline void printk_delay(int level) +{ + boot_delay_msec(level); -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; -/** - * console_lock_spinning_enable - mark beginning of code where another - * thread might safely busy wait - * - * This basically converts console_lock into a spinlock. This marks - * the section where the console_lock owner can not sleep, because - * there may be a waiter spinning (like a spinlock). Also it must be - * ready to hand over the lock at the end of the section. - */ -static void console_lock_spinning_enable(void) -{ - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +static bool kernel_sync_mode(void) +{ + return (oops_in_progress || sync_mode); } -/** - * console_lock_spinning_disable_and_check - mark end of code where another - * thread was able to busy wait and check if there is a waiter - * - * This is called at the end of the section where spinning is allowed. - * It has two functions. First, it is a signal that it is no longer - * safe to start busy waiting for the lock. Second, it checks if - * there is a busy waiter and passes the lock rights to her. - * - * Important: Callers lose the lock if there was a busy waiter. - * They must not touch items synchronized by console_lock - * in this case. - * - * Return: 1 if the lock rights were passed, 0 otherwise. 
- */ -static int console_lock_spinning_disable_and_check(void) +static bool console_may_sync(struct console *con) { - int waiter; + if (!(con->flags & CON_ENABLED)) + return false; + if (con->write_atomic && kernel_sync_mode()) + return true; + if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) + return true; + if (con->write && (con->flags & CON_BOOT) && !con->thread) + return true; + return false; +} - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); +static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) +{ + if (!(con->flags & CON_ENABLED)) + return false; - if (!waiter) { - spin_release(&console_owner_dep_map, _THIS_IP_); - return 0; + if (con->write_atomic && kernel_sync_mode()) { + con->write_atomic(con, text, text_len); + return true; } - /* The waiter is now free to continue */ - WRITE_ONCE(console_waiter, false); + if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) { + if (console_trylock()) { + con->write_atomic(con, text, text_len); + console_unlock(); + return true; + } - spin_release(&console_owner_dep_map, _THIS_IP_); + } else if (con->write && (con->flags & CON_BOOT) && !con->thread) { + if (console_trylock()) { + con->write(con, text, text_len); + console_unlock(); + return true; + } + } - /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. - */ - mutex_release(&console_lock_dep_map, _THIS_IP_); - return 1; + return false; } -/** - * console_trylock_spinning - try to get console_lock by busy waiting - * - * This allows to busy wait for the console_lock when the current - * owner is running in specially marked sections. It means that - * the current owner is running and cannot reschedule until it - * is ready to lose the lock. - * - * Return: 1 if we got the lock, 0 othrewise - */ -static int console_trylock_spinning(void) +static bool have_atomic_console(void) { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - unsigned long flags; + struct console *con; - if (console_trylock()) - return 1; + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + if (con->write_atomic) + return true; + } + return false; +} - printk_safe_enter_irqsave(flags); +static bool print_sync(struct console *con, u64 *seq) +{ + struct printk_info info; + struct printk_record r; + size_t text_len; - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); - waiter = READ_ONCE(console_waiter); - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; - } - raw_spin_unlock(&console_owner_lock); - - /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active - * printer, and do some printing ourselves. - * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). 
- */ - if (!spin) { - printk_safe_exit_irqrestore(flags); - return 0; - } + prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); - /* Owner will clear console_waiter on hand off */ - while (READ_ONCE(console_waiter)) - cpu_relax(); - spin_release(&console_owner_dep_map, _THIS_IP_); + if (!prb_read_valid(prb, *seq, &r)) + return false; - printk_safe_exit_irqrestore(flags); - /* - * The owner passed the console lock to us. - * Since we did not spin on console lock, annotate - * this as a trylock. Otherwise lockdep will - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); - return 1; + if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) + return false; + + *seq = r.info->seq; + + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); + + if (text_len) + printk_delay(r.info->level); + + return true; } -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. - */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static u64 read_console_seq(struct console *con) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + u64 seq2; + u64 seq; - trace_console_rcuidle(text, len); + seq = latched_seq_read_nolock(&con->printk_seq); + seq2 = latched_seq_read_nolock(&con->printk_sync_seq); + if (seq2 > seq) + seq = seq2; +#ifdef CONFIG_HAVE_NMI + seq2 = latched_seq_read_nolock(&con->printk_sync_nmi_seq); + if (seq2 > seq) + seq = seq2; +#endif + return seq; +} - if (!console_drivers) - return; +static void print_sync_until(struct console *con, u64 seq, bool is_locked) +{ + u64 printk_seq; - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), - "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; - } + while (!__printk_cpu_trylock()) + cpu_relax(); - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } + for (;;) { + printk_seq = read_console_seq(con); + if (printk_seq >= seq) + break; + if (!print_sync(con, &printk_seq)) + break; + + if (is_locked) + latched_seq_write(&con->printk_seq, printk_seq + 1); +#ifdef CONFIG_PRINTK_NMI + else if (in_nmi()) + latched_seq_write(&con->printk_sync_nmi_seq, printk_seq + 1); +#endif + else + latched_seq_write(&con->printk_sync_seq, printk_seq + 1); } + + __printk_cpu_unlock(); } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1979 @ static u8 *__printk_recursion_counter(vo local_irq_restore(flags); \ } while (0) -int printk_delay_msec __read_mostly; - -static inline void printk_delay(void) -{ - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; - - while (m--) { - mdelay(1); - touch_nmi_watchdog(); - } - } -} - static inline u32 printk_caller_id(void) { return in_task() ? 
task_pid_nr(current) : @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2066 @ int vprintk_store(int facility, int leve const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; + bool final_commit = false; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2077 @ int vprintk_store(int facility, int leve u16 text_len; int ret = 0; u64 ts_nsec; + u64 seq; /* * Since the duration of printk() can vary depending on the message @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2116 @ int vprintk_store(int facility, int leve if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + seq = r.info->seq; text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2124 @ int vprintk_store(int facility, int leve if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); + final_commit = true; } else { prb_commit(&e); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2148 @ int vprintk_store(int facility, int leve if (!prb_reserve(&e, prb, &r)) goto out; } + seq = r.info->seq; /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2164 @ int vprintk_store(int facility, int leve memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ - if (!(flags & LOG_NEWLINE)) + if (!(flags & LOG_NEWLINE)) { prb_commit(&e); - else + } else { prb_final_commit(&e); + final_commit = true; + } ret = text_len + trunc_msg_len; out: + /* only the kernel may perform synchronous printing */ + if (facility == 0 && final_commit) { + struct console *con; + + for_each_console(con) { + if (console_may_sync(con)) + print_sync_until(con, seq + 1, false); + } + } + printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2192 @ asmlinkage int vprintk_emit(int facility const char *fmt, va_list args) { int printed_len; - bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; - if (level == LOGLEVEL_SCHED) { + if (level == LOGLEVEL_SCHED) level = LOGLEVEL_DEFAULT; - in_sched = true; - } - - boot_delay_msec(level); - printk_delay(); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. 
- */ - if (console_trylock_spinning()) - console_unlock(); - preempt_enable(); - } - wake_up_klogd(); return printed_len; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2226 @ asmlinkage __visible int _printk(const c } EXPORT_SYMBOL(_printk); -#else /* CONFIG_PRINTK */ +static int printk_kthread_func(void *data) +{ + struct console *con = data; + unsigned long dropped = 0; + char *dropped_text = NULL; + struct printk_info info; + struct printk_record r; + char *ext_text = NULL; + size_t dropped_len; + int ret = -ENOMEM; + char *text = NULL; + char *write_text; + size_t len; + int error; + u64 seq; + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) + goto out; + } + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + dropped_text = kmalloc(64, GFP_KERNEL); + if (!text || !dropped_text) + goto out; + if (con->flags & CON_EXTENDED) + write_text = ext_text; + else + write_text = text; + + seq = read_console_seq(con); -#define CONSOLE_LOG_MAX 0 -#define printk_time false + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); -#define prb_read_valid(rb, seq, r) false -#define prb_first_valid_seq(rb) 0 + for (;;) { + error = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, &r) || kthread_should_stop()); -static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; + if (kthread_should_stop()) + break; + + if (error) + continue; + + if (seq != r.info->seq) { + dropped += r.info->seq - seq; + seq = r.info->seq; + } + + seq++; + + if (!(con->flags & CON_ENABLED)) + continue; + + if (suppress_message_printing(r.info->level)) + continue; + + if (con->flags & CON_EXTENDED) { + len = info_print_ext_header(ext_text, + CONSOLE_EXT_LOG_MAX, + r.info); + len += msg_print_ext_body(ext_text + len, + CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, + &r.info->dev_info); + } else { + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); + } + + console_lock(); + + /* + * Even though the printk kthread is always preemptible, it is + * still not allowed to call cond_resched() from within + * console drivers. The task may become non-preemptible in the + * console driver call chain. For example, vt_console_print() + * takes a spinlock and then can call into fbcon_redraw(), + * which can conditionally invoke cond_resched(). + */ + console_may_schedule = 0; + + if (kernel_sync_mode() && con->write_atomic) { + console_unlock(); + break; + } + + if (!(con->flags & CON_EXTENDED) && dropped) { + dropped_len = snprintf(dropped_text, 64, + "** %lu printk messages dropped **\n", + dropped); + dropped = 0; + + con->write(con, dropped_text, dropped_len); + printk_delay(r.info->level); + } + + con->write(con, write_text, len); + if (len) + printk_delay(r.info->level); + + latched_seq_write(&con->printk_seq, seq); -static size_t record_print_text(const struct printk_record *r, - bool syslog, bool time) + console_unlock(); + } + ret = 0; +out: + kfree(dropped_text); + kfree(text); + kfree(ext_text); + pr_info("%sconsole [%s%d]: printing thread stopped\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return ret; +} + +/* Must be called within console_lock(). 
*/ +static void start_printk_kthread(struct console *con) { - return 0; + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + pr_err("%sconsole [%s%d]: unable to start printing thread\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return; + } + pr_info("%sconsole [%s%d]: printing thread started\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); } -static ssize_t info_print_ext_header(char *buf, size_t size, - struct printk_info *info) + +/* protected by console_lock */ +static bool kthreads_started; + +/* Must be called within console_lock(). */ +static void console_try_thread(struct console *con) { - return 0; + if (kthreads_started) { + start_printk_kthread(con); + return; + } + + /* + * The printing threads have not been started yet. If this console + * can print synchronously, print all unprinted messages. + */ + if (console_may_sync(con)) { + unsigned long flags; + + local_irq_save(flags); + print_sync_until(con, prb_next_seq(prb), true); + local_irq_restore(flags); + } } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *text, size_t text_len, - struct dev_printk_info *dev_info) { return 0; } -static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} -static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2638 @ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(void) -{ - return cpu_online(raw_smp_processor_id()) || have_callable_console(); -} - /** * console_unlock - unlock the console system * @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2654 @ static inline int can_use_console(void) */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; - if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - - /* - * Console drivers are called with interrupts disabled, so - * @console_may_schedule should be cleared before; however, we may - * end up dumping a lot of lines, for example, if called from - * console registration path, and should invoke cond_resched() - * between lines if allowable. Not doing so can cause a very long - * scheduling stall on a slow console leading to RCU stall and - * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. 
Therefore the value must be stored before - * and cleared after the "again" goto label. - */ - do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - } - - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } - - /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. - */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; - - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - console_locked = 0; up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) - goto again; } EXPORT_SYMBOL(console_unlock); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2710 @ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { - /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. 
- */ - console_trylock(); - console_may_schedule = 0; + if (!console_trylock()) + return; + +#ifdef CONFIG_PRINTK + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + latched_seq_write(&c->printk_seq, seq); + } +#endif - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); console_unlock(); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2859 @ static int try_enable_new_console(struct void register_console(struct console *newcon) { struct console *bcon = NULL; + u64 __maybe_unused seq = 0; int err; for_each_console(bcon) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2882 @ void register_console(struct console *ne } } + newcon->thread = NULL; + if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2925 @ void register_console(struct console *ne * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + newcon->flags |= CON_HANDOVER; + } /* * Put this console in the list - keep the @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2950 @ void register_console(struct console *ne if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; +#ifdef CONFIG_PRINTK + if (!(newcon->flags & CON_PRINTBUFFER)) + seq = prb_next_seq(prb); - /* Get a consistent copy of @syslog_seq. 
*/ - mutex_lock(&syslog_lock); - console_seq = syslog_seq; - mutex_unlock(&syslog_lock); - } + seqcount_latch_init(&newcon->printk_seq.latch); + latched_seq_write(&newcon->printk_seq, seq); + seqcount_latch_init(&newcon->printk_sync_seq.latch); + latched_seq_write(&newcon->printk_sync_seq, seq); +#ifdef CONFIG_HAVE_NMI + seqcount_latch_init(&newcon->printk_sync_nmi_seq.latch); + latched_seq_write(&newcon->printk_sync_nmi_seq, seq); +#endif + + console_try_thread(newcon); +#endif /* CONFIG_PRINTK */ console_unlock(); console_sysfs_notify(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3038 @ int unregister_console(struct console *c console_unlock(); console_sysfs_notify(); + if (console->thread && !IS_ERR(console->thread)) + kthread_stop(console->thread); + if (console->exit) res = console->exit(console); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3123 @ static int __init printk_late_init(void) unregister_console(con); } } + +#ifdef CONFIG_PRINTK + console_lock(); + for_each_console(con) + start_printk_kthread(con); + kthreads_started = true; + console_unlock(); +#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3155 @ static void wake_up_klogd_work_func(stru { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3203 @ void defer_console_output(void) void printk_trigger_flush(void) { - defer_console_output(); + wake_up_klogd(); } int vprintk_deferred(const char *fmt, va_list args) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3354 @ void kmsg_dump(enum kmsg_dump_reason rea { struct kmsg_dumper *dumper; + if (!oops_in_progress) { + /* + * If atomic consoles are available, activate kernel sync mode + * to make sure any final messages are visible. The trailing + * printk message is important to flush any pending messages. + */ + if (have_atomic_console()) { + sync_mode = true; + pr_info("enabled sync mode\n"); + } + + /* + * Give the printing threads time to flush, allowing up to + * 1s of no printing forward progress before giving up. 
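+	 *
+	 * Once sync mode is enabled above, consoles implementing
+	 * ->write_atomic() are written synchronously from the printk
+	 * path itself (see call_sync_console_driver()), so this flush
+	 * mainly catches up the backlog accumulated before the dump.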
+ */ + pr_flush(1000, true); + } + rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3554 @ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #ifdef CONFIG_SMP static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static unsigned int kgdb_cpu = -1; /** * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3634 @ EXPORT_SYMBOL(__printk_cpu_trylock); */ void __printk_cpu_unlock(void) { + bool trigger_kgdb = false; + unsigned int cpu; + if (atomic_read(&printk_cpulock_nested)) { atomic_dec(&printk_cpulock_nested); return; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3647 @ void __printk_cpu_unlock(void) * LMM(__printk_cpu_unlock:A) */ + cpu = smp_processor_id(); + if (kgdb_cpu == cpu) { + trigger_kgdb = true; + kgdb_cpu = -1; + } + /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3673 @ void __printk_cpu_unlock(void) */ atomic_set_release(&printk_cpulock_owner, -1); /* LMM(__printk_cpu_unlock:B) */ + + if (trigger_kgdb) { + pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu); + kgdb_roundup_cpu(cpu); + } } EXPORT_SYMBOL(__printk_cpu_unlock); + +bool kgdb_roundup_delay(unsigned int cpu) +{ + if (cpu != atomic_read(&printk_cpulock_owner)) + return false; + + kgdb_cpu = cpu; + return true; +} +EXPORT_SYMBOL(kgdb_roundup_delay); #endif /* CONFIG_SMP */ + +#ifdef CONFIG_PRINTK +static void pr_msleep(bool may_sleep, int ms) +{ + if (may_sleep) { + msleep(ms); + } else { + while (ms--) + udelay(1000); + } +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Any context. + * Return: true if all enabled printers are caught up. 
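+ *
+ * A typical call, as used on the panic and kmsg_dump() paths in this
+ * series, waits up to one second but restarts the timeout whenever a
+ * console makes forward progress:
+ *
+ *	pr_flush(1000, true);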
+ */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *con; + u64 last_diff = 0; + bool may_sleep; + u64 printk_seq; + u64 diff; + u64 seq; + + may_sleep = (preemptible() && + !in_softirq() && + system_state >= SYSTEM_RUNNING); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + printk_seq = read_console_seq(con); + if (printk_seq < seq) + diff += seq - printk_seq; + } + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + pr_msleep(may_sleep, 100); + } else if (remaining < 100) { + pr_msleep(may_sleep, remaining); + remaining = 0; + } else { + pr_msleep(may_sleep, 100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} +EXPORT_SYMBOL(pr_flush); +#endif /* CONFIG_PRINTK */ Index: linux-5.15/kernel/ptrace.c =================================================================== --- linux-5.15.orig/kernel/ptrace.c +++ linux-5.15/kernel/ptrace.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:200 @ static bool ptrace_freeze_traced(struct spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) & __TASK_TRACED) + WRITE_ONCE(task->__state, __TASK_TRACED); + else + task->saved_state = __TASK_TRACED; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +#else WRITE_ONCE(task->__state, __TASK_TRACED); +#endif ret = true; } spin_unlock_irq(&task->sighand->siglock); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:221 @ static bool ptrace_freeze_traced(struct static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) + unsigned long flags; + bool frozen = true; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && + READ_ONCE(task->__state) != __TASK_TRACED) return; WARN_ON(!task->ptrace || task->parent != current); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:235 @ static void ptrace_unfreeze_traced(struc * Recheck state under the lock to close this race. 
*/ spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); - } + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (READ_ONCE(task->__state) == __TASK_TRACED) + WRITE_ONCE(task->__state, TASK_TRACED); + +#ifdef CONFIG_PREEMPT_RT + else if (task->saved_state == __TASK_TRACED) + task->saved_state = TASK_TRACED; +#endif + else + frozen = false; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (frozen && __fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + spin_unlock_irq(&task->sighand->siglock); } Index: linux-5.15/kernel/rcu/tasks.h =================================================================== --- linux-5.15.orig/kernel/rcu/tasks.h +++ linux-5.15/kernel/rcu/tasks.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1365 @ static void test_rcu_tasks_callback(stru rttd->notrun = true; } -static void rcu_tasks_initiate_self_tests(void) +void rcu_tasks_initiate_self_tests(void) { pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1402 @ static int rcu_tasks_verify_self_tests(v return ret; } late_initcall(rcu_tasks_verify_self_tests); -#else /* #ifdef CONFIG_PROVE_RCU */ -static void rcu_tasks_initiate_self_tests(void) { } -#endif /* #else #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_PROVE_RCU */ void __init rcu_init_tasks_generic(void) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1417 @ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif - - // Run the self-tests. - rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ Index: linux-5.15/kernel/rcu/tree.c =================================================================== --- linux-5.15.orig/kernel/rcu/tree.c +++ linux-5.15/kernel/rcu/tree.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2282 @ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded, needwake = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); + offloaded = rcu_rdp_is_offloaded(rdp); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || rdp->gpwrap) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2450 @ static void rcu_do_batch(struct rcu_data int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool offloaded; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2476 @ static void rcu_do_batch(struct rcu_data rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? 
sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); Index: linux-5.15/kernel/sched/core.c =================================================================== --- linux-5.15.orig/kernel/sched/core.c +++ linux-5.15/kernel/sched/core.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:78 @ __read_mostly int sysctl_resched_latency * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * period over which we measure -rt task CPU usage in us. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:990 @ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY + +static int tsk_is_polling(struct task_struct *p) +{ +#ifdef TIF_POLLING_NRFLAG + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +#else + return 0; +#endif +} + +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2185 @ void migrate_disable(void) preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; + preempt_lazy_disable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2197 @ void migrate_enable(void) if (p->migration_disabled > 1) { p->migration_disabled--; return; + } else if (WARN_ON_ONCE(p->migration_disabled == 0)) { + return; } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2216 @ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; + preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3283 @ unsigned long wait_task_inactive(struct * is actually now running somewhere else! 
 */
 	while (task_running(rq, p)) {
-		if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+		if (match_state && !task_match_state_lock(p, match_state))
 			return 0;
 		cpu_relax();
 	}
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3298 @ unsigned long wait_task_inactive(struct
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || READ_ONCE(p->__state) == match_state)
+		if (!match_state || task_match_state_or_saved(p, match_state))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3332 @ unsigned long wait_task_inactive(struct
 			ktime_t to = NSEC_PER_SEC / HZ;
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
 			continue;
 		}
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4486 @ int sched_fork(unsigned long clone_flags
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(p)->preempt_lazy_count = 0;
+#endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4984 @ static struct rq *finish_task_switch(str
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_sched(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
 		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
+		 * Release the VMAP'ed task stack immediately for reuse. On RT
+		 * enabled kernels this is delayed for latency reasons.
 		 */
-		kprobe_flush_task(prev);
-
-		/* Task is done with its stack. */
-		put_task_stack(prev);
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+			put_task_stack(prev);
 
 		put_task_struct_rcu_user(prev);
 	}
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6395 @ static void __sched notrace __schedule(u
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_lazy(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
 	rq->last_seen_need_resched_ns = 0;
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6617 @ static void __sched notrace preempt_sche
 	} while (need_resched());
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away, since
+ * that flag is set by an RT task. Otherwise we try to avoid being
+ * scheduled out as long as the preempt_lazy_count counter is > 0.
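+ *
+ * A migrate-disabled region is a typical lazy section, roughly (see
+ * migrate_disable()/migrate_enable() above):
+ *
+ *	preempt_lazy_disable();
+ *	... critical section: lazy reschedules are deferred here, but a
+ *	    TIF_NEED_RESCHED set by an RT task still preempts ...
+ *	preempt_lazy_enable();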
+ */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static inline int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6654 @ asmlinkage __visible void __sched notrac */ if (likely(!preemptible())) return; - + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6688 @ asmlinkage __visible void __sched notrac if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; + do { /* * Because the function tracer can trace preempt_count_sub() @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:8844 @ void __init init_idle(struct task_struct /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9647 @ void __init sched_init(void) } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - - return (nested == preempt_offset); -} -void __might_sleep(const char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line) { unsigned int state = get_current_state(); /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9662 @ void __might_sleep(const char *file, int (void *)current->task_state_change, (void *)current->task_state_change); - ___might_sleep(file, line, preempt_offset); + __might_resched(file, line, 0); } EXPORT_SYMBOL(__might_sleep); -void ___might_sleep(const char *file, int line, int preempt_offset) +static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) + return; + + if (preempt_count() == preempt_offset) + return; + + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, ip); +} + +static inline bool resched_offsets_ok(unsigned int offsets) +{ + unsigned int nested = preempt_count(); + + nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; + + return nested == offsets; +} + +void __might_resched(const char *file, int line, unsigned int offsets) { /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9697 @ void ___might_sleep(const char *file, in /* WARN_ON_ONCE() by default, no rate limit required: */ rcu_sleep_check(); - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + if ((resched_offsets_ok(offsets) && !irqs_disabled() && !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:9710 @ void ___might_sleep(const char *file, in /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), 
current->non_block_count, - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, + current->pid, current->comm); + pr_err("preempt_count: %x, expected: %x\n", preempt_count(), + offsets & MIGHT_RESCHED_PREEMPT_MASK); + + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + pr_err("RCU nest depth: %d, expected: %u\n", + rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); + } if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + pr_emerg("Thread overran stack, or stack corrupted\n"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && !preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(KERN_ERR, preempt_disable_ip); - } + + print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, + preempt_disable_ip); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(___might_sleep); +EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_offset) { Index: linux-5.15/kernel/sched/fair.c =================================================================== --- linux-5.15.orig/kernel/sched/fair.c +++ linux-5.15/kernel/sched/fair.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4587 @ check_preempt_tick(struct cfs_rq *cfs_rq ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4611 @ check_preempt_tick(struct cfs_rq *cfs_rq return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4754 @ entity_tick(struct cfs_rq *cfs_rq, struc * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4894 @ static void __account_cfs_rq_runtime(str * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5657 @ static void hrtick_start_fair(struct rq if (delta < 0) { if (task_current(rq, p)) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:7363 @ static void check_preempt_wakeup(struct return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11444 @ static void task_fork_fair(struct task_s * 'current' within the tree based on its new key value. 
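+ * + * (This series requests the reschedule below via resched_curr_lazy(): the lazy flag is only acted upon once the running task leaves its preempt-lazy section, which avoids a forced context switch on PREEMPT_RT.)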
*/ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11471 @ prio_changed_fair(struct rq *rq, struct */ if (task_current(rq, p)) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } Index: linux-5.15/kernel/sched/features.h =================================================================== --- linux-5.15.orig/kernel/sched/features.h +++ linux-5.15/kernel/sched/features.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:49 @ SCHED_FEAT(DOUBLE_TICK, false) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. Index: linux-5.15/kernel/sched/sched.h =================================================================== --- linux-5.15.orig/kernel/sched/sched.h +++ linux-5.15/kernel/sched/sched.h @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2330 @ extern void reweight_task(struct task_st extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); Index: linux-5.15/kernel/sched/swait.c =================================================================== --- linux-5.15.orig/kernel/sched/swait.c +++ linux-5.15/kernel/sched/swait.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:67 @ void swake_up_all(struct swait_queue_hea struct swait_queue *curr; LIST_HEAD(tmp); + WARN_ON(irqs_disabled()); raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { Index: linux-5.15/kernel/sched/topology.c =================================================================== --- linux-5.15.orig/kernel/sched/topology.c +++ linux-5.15/kernel/sched/topology.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:529 @ static int init_rootdomain(struct root_d #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif rd->visit_gen = 0; Index: linux-5.15/kernel/signal.c =================================================================== --- linux-5.15.orig/kernel/signal.c +++ linux-5.15/kernel/signal.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1327 @ force_sig_info_to_task(struct kernel_sig struct k_sigaction *action; int sig = info->si_signo; + /* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. 
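+ * + * A minimal sketch of the consuming side (assumed to run from the TIF_NOTIFY_RESUME handling on the way back to user space; the hook is arch code and not part of this hunk): + * + * if (unlikely(current->forced_info.si_signo)) { + * force_sig_info(&current->forced_info); + * current->forced_info.si_signo = 0; + * }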
+ */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + struct task_struct *t = current; + + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = info->si_signo; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2339 @ static void ptrace_stop(int exit_code, i if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { Index: linux-5.15/kernel/smp.c =================================================================== --- linux-5.15.orig/kernel/smp.c +++ linux-5.15/kernel/smp.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:693 @ void flush_smp_call_function_from_idle(v cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); + local_irq_save(flags); flush_smp_call_function_queue(true); - if (local_softirq_pending()) - do_softirq(); + + if (local_softirq_pending()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + do_softirq(); + } else { + struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); + + if (ksoftirqd && !task_is_running(ksoftirqd)) + wake_up_process(ksoftirqd); + } + } local_irq_restore(flags); } Index: linux-5.15/kernel/trace/trace.c =================================================================== --- linux-5.15.orig/kernel/trace/trace.c +++ linux-5.15/kernel/trace/trace.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2633 @ unsigned int tracing_gen_ctx_irq_test(un trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | +#ifdef CONFIG_PREEMPT_LAZY + if (need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; +#endif + + return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (preempt_lazy_count() & 0xff) << 16 | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4220 @ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / _-=> migrate-disable \n" - "# ||||| / delay \n" - "# cmd pid |||||| time | caller \n" - "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n" + "# / _-------=> irqs-off \n" + "# | / _------=> need-resched \n" + "# || / _-----=> need-resched-lazy\n" + "# ||| / _----=> hardirq/softirq \n" + "# |||| / _---=> preempt-depth \n" + "# ||||| / _--=> preempt-lazy-depth\n" + "# |||||| / _-=> migrate-disable \n" + "# ||||||| / delay \n" + "# cmd pid |||||||| time | caller \n" + "# \\ / |||||||| \\ | / \n"); } static void 
print_event_info(struct array_buffer *buf, struct seq_file *m) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4264 @ static void print_func_help_header_irq(s print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); - seq_printf(m, "# %.*s|||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); + seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); + seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } void Index: linux-5.15/kernel/trace/trace_events.c =================================================================== --- linux-5.15.orig/kernel/trace/trace_events.c +++ linux-5.15/kernel/trace/trace_events.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:196 @ static int trace_define_common_fields(vo /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned char, preempt_lazy_count); return ret; } Index: linux-5.15/kernel/trace/trace_output.c =================================================================== --- linux-5.15.orig/kernel/trace/trace_output.c +++ linux-5.15/kernel/trace/trace_output.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:454 @ int trace_print_lat_fmt(struct trace_seq { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:485 @ int trace_print_lat_fmt(struct trace_seq break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; + hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:496 @ int trace_print_lat_fmt(struct trace_seq softirq ? 's' : '.' 
; - trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq); + trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq); if (entry->preempt_count & 0xf) trace_seq_printf(s, "%x", entry->preempt_count & 0xf); else trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + trace_seq_putc(s, '.'); + if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else Index: linux-5.15/lib/bug.c =================================================================== --- linux-5.15.orig/lib/bug.c +++ linux-5.15/lib/bug.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:208 @ enum bug_trap_type report_bug(unsigned l else pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", (void *)bugaddr); + pr_flush(1000, true); return BUG_TRAP_TYPE_BUG; } Index: linux-5.15/lib/dump_stack.c =================================================================== --- linux-5.15.orig/lib/dump_stack.c +++ linux-5.15/lib/dump_stack.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:105 @ asmlinkage __visible void dump_stack_lvl * Permit this cpu to perform nested stack dumps while serialising * against other CPUs */ - printk_cpu_lock_irqsave(flags); + raw_printk_cpu_lock_irqsave(flags); __dump_stack(log_lvl); - printk_cpu_unlock_irqrestore(flags); + raw_printk_cpu_unlock_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); Index: linux-5.15/lib/irq_poll.c =================================================================== --- linux-5.15.orig/lib/irq_poll.c +++ linux-5.15/lib/irq_poll.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:194 @ static int irq_poll_cpu_dead(unsigned in * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ + local_bh_disable(); local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); + local_bh_enable(); return 0; } Index: linux-5.15/lib/locking-selftest.c =================================================================== --- linux-5.15.orig/lib/locking-selftest.c +++ linux-5.15/lib/locking-selftest.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:29 @ #include <linux/rtmutex.h> #include <linux/local_lock.h> +#ifdef CONFIG_PREEMPT_RT +# define NON_RT(...) +#else +# define NON_RT(...) 
__VA_ARGS__ +#endif + /* * Change this to 1 if you want to see the failure printouts: */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:148 @ static DEFINE_RT_MUTEX(rtmutex_Z2); #endif -static local_lock_t local_A = INIT_LOCAL_LOCK(local_A); +static DEFINE_PER_CPU(local_lock_t, local_A); /* * non-inlined runtime initializers, to let separate locks share @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:721 @ GENERATE_TESTCASE(ABCDBCDA_rtmutex); #undef E +#ifdef CONFIG_PREEMPT_RT +# define RT_PREPARE_DBL_UNLOCK() { migrate_disable(); rcu_read_lock(); } +#else +# define RT_PREPARE_DBL_UNLOCK() +#endif /* * Double unlock: */ #define E() \ \ LOCK(A); \ + RT_PREPARE_DBL_UNLOCK(); \ UNLOCK(A); \ UNLOCK(A); /* fail */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:817 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_ #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:826 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_ #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) +#endif #undef E1 #undef E2 +#ifndef CONFIG_PREEMPT_RT /* * Enabling hardirqs with a softirq-safe lock held: */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:864 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A #undef E1 #undef E2 +#endif + /* * Enabling irqs with an irq-safe lock held: */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:895 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:904 @ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:943 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_ #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:952 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_ #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:993 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_ #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1002 @ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_ #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1057 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_inver #include "locking-selftest-wlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-spin-softirq.h" 
GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1066 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_inver #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1234 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1282 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) +#endif #undef E1 #undef E2 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1338 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) +#ifndef CONFIG_PREEMPT_RT #include "locking-selftest-softirq.h" #include "locking-selftest-rlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) #include "locking-selftest-wlock.h" GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1354 @ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) # define I_WW(x) lockdep_reset_lock(&x.dep_map) -# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map) +# define I_LOCAL_LOCK(x) lockdep_reset_lock(this_cpu_ptr(&local_##x.dep_map)) #ifdef CONFIG_RT_MUTEXES # define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) #endif @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1414 @ static void reset_locks(void) init_shared_classes(); raw_spin_lock_init(&raw_lock_A); raw_spin_lock_init(&raw_lock_B); - local_lock_init(&local_A); + local_lock_init(this_cpu_ptr(&local_A)); ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep); memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2)); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1432 @ static int unexpected_testcase_failures; static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) { - unsigned long saved_preempt_count = preempt_count(); + int saved_preempt_count = preempt_count(); +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + int saved_mgd_count = current->migration_disabled; +#endif + int saved_rcu_count = current->rcu_read_lock_nesting; +#endif WARN_ON(irqs_disabled()); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1472 @ static void dotest(void (*testcase_fn)(v * count, so restore it: */ preempt_count_set(saved_preempt_count); + +#ifdef CONFIG_PREEMPT_RT +#ifdef CONFIG_SMP + while (current->migration_disabled > saved_mgd_count) + 
migrate_enable(); +#endif + + while (current->rcu_read_lock_nesting > saved_rcu_count) + rcu_read_unlock(); + WARN_ON_ONCE(current->rcu_read_lock_nesting < saved_rcu_count); +#endif + #ifdef CONFIG_TRACE_IRQFLAGS if (softirq_count()) current->softirqs_enabled = 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1551 @ static inline void print_testname(const #define DO_TESTCASE_2x2RW(desc, name, nr) \ DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ - DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ + NON_RT(DO_TESTCASE_2RW("soft-"desc, name##_soft, nr)) \ #define DO_TESTCASE_6x2x2RW(desc, name) \ DO_TESTCASE_2x2RW(desc, name, 123); \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1599 @ static inline void print_testname(const #define DO_TESTCASE_2I(desc, name, nr) \ DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2IB(desc, name, nr) \ DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_1B("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_1B("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6I(desc, name, nr) \ DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_6IRW(desc, name, nr) \ DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ - DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); + NON_RT(DO_TESTCASE_3RW("soft-"desc, name##_soft, nr)); #define DO_TESTCASE_2x3(desc, name) \ DO_TESTCASE_3(desc, name, 12); \ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1703 @ static void ww_test_fail_acquire(void) #endif } +#ifdef CONFIG_PREEMPT_RT +#define ww_mutex_base_lock(b) rt_mutex_lock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) +#else +#define ww_mutex_base_lock(b) mutex_lock(b) +#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) +#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) +#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) +#define ww_mutex_base_unlock(b) mutex_unlock(b) +#endif + static void ww_test_normal(void) { int ret; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1731 @ static void ww_test_normal(void) /* mutex_lock (and indirectly, mutex_lock_nested) */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - mutex_unlock(&o.base); + ww_mutex_base_lock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_interruptible (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_interruptible(&o.base); + ret = ww_mutex_base_lock_interruptible(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* mutex_lock_killable (and *_nested) */ o.ctx = (void *)~0UL; - ret = mutex_lock_killable(&o.base); + ret = ww_mutex_base_lock_killable(&o.base); if (!ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, succeeding */ o.ctx = (void *)~0UL; - ret = mutex_trylock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(!ret); if (ret) - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); else WARN_ON(1); WARN_ON(o.ctx != (void *)~0UL); /* trylock, 
failing */ o.ctx = (void *)~0UL; - mutex_lock(&o.base); - ret = mutex_trylock(&o.base); + ww_mutex_base_lock(&o.base); + ret = ww_mutex_base_trylock(&o.base); WARN_ON(ret); - mutex_unlock(&o.base); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); /* nest_lock */ o.ctx = (void *)~0UL; - mutex_lock_nest_lock(&o.base, &t); - mutex_unlock(&o.base); + ww_mutex_base_lock_nest_lock(&o.base, &t); + ww_mutex_base_unlock(&o.base); WARN_ON(o.ctx != (void *)~0UL); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1787 @ static void ww_test_two_contexts(void) static void ww_test_diff_class(void) { WWAI(&t); -#ifdef CONFIG_DEBUG_MUTEXES +#ifdef DEBUG_WW_MUTEXES t.ww_class = NULL; #endif WWL(&o, &t); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1851 @ static void ww_test_edeadlk_normal(void) { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1867 @ static void ww_test_edeadlk_normal(void) o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); WWL(&o2, &t); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1877 @ static void ww_test_edeadlk_normal_slow( { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1893 @ static void ww_test_edeadlk_normal_slow( o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWU(&o); ww_mutex_lock_slow(&o2, &t); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1903 @ static void ww_test_edeadlk_no_unlock(vo { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); o2.ctx = &t2; mutex_release(&o2.base.dep_map, _THIS_IP_); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1919 @ static void ww_test_edeadlk_no_unlock(vo o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); WWL(&o2, &t); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1928 @ static void ww_test_edeadlk_no_unlock_sl { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1944 @ static void ww_test_edeadlk_no_unlock_sl o2.ctx = NULL; mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); - mutex_unlock(&o2.base); + ww_mutex_base_unlock(&o2.base); ww_mutex_lock_slow(&o2, &t); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1953 @ static void ww_test_edeadlk_acquire_more { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1974 @ static void ww_test_edeadlk_acquire_more { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1995 @ static void ww_test_edeadlk_acquire_more { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @ 
linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2021 @ static void ww_test_edeadlk_acquire_more { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; - mutex_lock(&o3.base); + ww_mutex_base_lock(&o3.base); mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2046 @ static void ww_test_edeadlk_acquire_wron { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2071 @ static void ww_test_edeadlk_acquire_wron { int ret; - mutex_lock(&o2.base); + ww_mutex_base_lock(&o2.base); mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2712 @ static void wait_context_tests(void) static void local_lock_2(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ - local_lock_release(&local_A); + local_lock(&local_A); /* IRQ-ON */ + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2722 @ static void local_lock_2(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } static void local_lock_3A(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2742 @ static void local_lock_3A(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); } static void local_lock_3B(void) { - local_lock_acquire(&local_A); /* IRQ-ON */ + local_lock(&local_A); /* IRQ-ON */ spin_lock(&lock_B); /* IRQ-ON */ spin_unlock(&lock_B); - local_lock_release(&local_A); + local_unlock(&local_A); HARDIRQ_ENTER(); spin_lock(&lock_A); /* IN-IRQ */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2762 @ static void local_lock_3B(void) HARDIRQ_DISABLE(); spin_lock(&lock_A); - local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ - local_lock_release(&local_A); + local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ + local_unlock(&local_A); spin_unlock(&lock_A); HARDIRQ_ENABLE(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2878 @ void locking_selftest(void) printk("------------------------\n"); printk("| Locking API testsuite:\n"); printk("----------------------------------------------------------------------------\n"); - printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); + printk(" | spin |wlock |rlock |mutex | wsem | rsem |rtmutex\n"); printk(" --------------------------------------------------------------------------\n"); init_shared_classes(); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2951 @ void locking_selftest(void) DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", 
W1R2_R2R3_W3W1); printk(" --------------------------------------------------------------------------\n"); - /* * irq-context testcases: */ DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); - DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); + NON_RT(DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A)); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); Index: linux-5.15/lib/nmi_backtrace.c =================================================================== --- linux-5.15.orig/lib/nmi_backtrace.c +++ linux-5.15/lib/nmi_backtrace.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:102 @ bool nmi_cpu_backtrace(struct pt_regs *r * Allow nested NMI backtraces while serializing * against other CPUs. */ - printk_cpu_lock_irqsave(flags); + raw_printk_cpu_lock_irqsave(flags); if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:113 @ bool nmi_cpu_backtrace(struct pt_regs *r else dump_stack(); } - printk_cpu_unlock_irqrestore(flags); + raw_printk_cpu_unlock_irqrestore(flags); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } Index: linux-5.15/lib/scatterlist.c =================================================================== --- linux-5.15.orig/lib/scatterlist.c +++ linux-5.15/lib/scatterlist.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:831 @ static bool sg_miter_get_next_page(struc * stops @miter. * * Context: - * Don't care if @miter is stopped, or not proceeded yet. - * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. + * Don't care. * * Returns: * true if @miter contains the valid mapping. false if end of sg @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:867 @ EXPORT_SYMBOL(sg_miter_skip); * @miter->addr and @miter->length point to the current mapping. * * Context: - * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled - * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. + * May sleep if !SG_MITER_ATOMIC. * * Returns: * true if @miter contains the next mapping. false if end of sg @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:907 @ EXPORT_SYMBOL(sg_miter_next); * need to be released during iteration. * * Context: - * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care - * otherwise. + * Don't care otherwise. 
*/ void sg_miter_stop(struct sg_mapping_iter *miter) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:922 @ void sg_miter_stop(struct sg_mapping_ite flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); Index: linux-5.15/localversion-rt =================================================================== --- /dev/null +++ linux-5.15/localversion-rt @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1 @ +-rt61 Index: linux-5.15/mm/Kconfig =================================================================== --- linux-5.15.orig/mm/Kconfig +++ linux-5.15/mm/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:374 @ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI help Index: linux-5.15/mm/memcontrol.c =================================================================== --- linux-5.15.orig/mm/memcontrol.c +++ linux-5.15/mm/memcontrol.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:657 @ static u64 flush_next_time; #define FLUSH_TIME (2UL*HZ) +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it + * cannot rely on this as part of an acquired spinlock_t lock. These functions + * are never used in hardirq context on PREEMPT_RT and therefore disabling + * preemption is sufficient. + */ +static void memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#else + VM_BUG_ON(!irqs_disabled()); +#endif +} + +static void __memcg_stats_lock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); +#endif +} + +static void memcg_stats_unlock(void) +{ +#ifdef CONFIG_PREEMPT_RT + preempt_enable(); +#endif +} + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { unsigned int x; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:769 @ void __mod_memcg_lruvec_state(struct lru pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + /* + * Callers from rmap rely on disabled preemption because they never + * update their counters from interrupt context. For those counters we + * check that the update is never performed from an interrupt context, + * while other callers need to have interrupts disabled.
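+ * + * A compact restatement of the check below (illustration only): + * + * NR_ANON_MAPPED, NR_FILE_MAPPED, NR_ANON_THPS, NR_SHMEM_PMDMAPPED, + * NR_FILE_PMDMAPPED -> WARN_ON_ONCE(!in_task()) + * everything else -> WARN_ON_ONCE(!irqs_disabled())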
+ */ + __memcg_stats_lock(); + if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + switch (idx) { + case NR_ANON_MAPPED: + case NR_FILE_MAPPED: + case NR_ANON_THPS: + case NR_SHMEM_PMDMAPPED: + case NR_FILE_PMDMAPPED: + WARN_ON_ONCE(!in_task()); + break; + default: + WARN_ON_ONCE(!irqs_disabled()); + } + } + /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[idx], val); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:797 @ void __mod_memcg_lruvec_state(struct lru __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); memcg_rstat_updated(memcg, val); + memcg_stats_unlock(); } /** @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:898 @ void __count_memcg_events(struct mem_cgr if (mem_cgroup_disabled()) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[idx], count); memcg_rstat_updated(memcg, count); + memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:965 @ static bool mem_cgroup_event_ratelimit(s */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2161 @ void unlock_page_memcg(struct page *page } EXPORT_SYMBOL(unlock_page_memcg); -struct obj_stock { +struct memcg_stock_pcp { + local_lock_t stock_lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; -#else - int dummy[0]; #endif -}; - -struct memcg_stock_pcp { - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; - struct obj_stock task_obj; - struct obj_stock irq_obj; struct work_struct work; unsigned long flags; #define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { + .stock_lock = INIT_LOCAL_LOCK(stock_lock), +}; static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct obj_stock *stock); +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); #else -static inline void drain_obj_stock(struct obj_stock *stock) +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { + return NULL; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2201 @ static bool obj_stock_flush_required(str #endif /* - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable - * sequence used in this case to access content from object stock is slow. - * To optimize for user context access, there are now two object stocks for - * task context and interrupt context access respectively. - * - * The task context object stock can be accessed by disabling preemption only - * which is cheap in non-preempt kernel. The interrupt context object stock - * can only be accessed after disabling interrupt. 
User context code can - * access interrupt object stock, but not vice versa. - */ -static inline struct obj_stock *get_obj_stock(unsigned long *pflags) -{ - struct memcg_stock_pcp *stock; - - if (likely(in_task())) { - *pflags = 0UL; - preempt_disable(); - stock = this_cpu_ptr(&memcg_stock); - return &stock->task_obj; - } - - local_irq_save(*pflags); - stock = this_cpu_ptr(&memcg_stock); - return &stock->irq_obj; -} - -static inline void put_obj_stock(unsigned long flags) -{ - if (likely(in_task())) - preempt_enable(); - else - local_irq_restore(flags); -} - -/** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2220 @ static bool consume_stock(struct mem_cgr if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2228 @ static bool consume_stock(struct mem_cgr ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2257 @ static void drain_stock(struct memcg_sto static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2265 @ static void drain_local_stock(struct wor * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(&stock->irq_obj); - if (in_task()) - drain_obj_stock(&stock->task_obj); + old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } /* * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned long flags; - - local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2295 @ static void refill_stock(struct mem_cgro if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); +} - local_irq_restore(flags); +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + unsigned long flags; + + local_lock_irqsave(&memcg_stock.stock_lock, flags); + __refill_stock(memcg, nr_pages); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2323 @ static void drain_all_stock(struct mem_c * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. 
*/ - curcpu = get_cpu(); + migrate_disable(); + curcpu = smp_processor_id(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2347 @ static void drain_all_stock(struct mem_c schedule_work_on(cpu, &stock->work); } } - put_cpu(); + migrate_enable(); mutex_unlock(&percpu_charge_mutex); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3112 @ void __memcg_kmem_uncharge_page(struct p void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); int *bytes; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + stock = this_cpu_ptr(&memcg_stock); + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx * changes. */ if (stock->cached_objcg != objcg) { - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3170 @ void mod_objcg_state(struct obj_cgroup * if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { + struct memcg_stock_pcp *stock; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); bool ret = false; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } -static void drain_obj_stock(struct obj_stock *stock) +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = stock->cached_objcg; if (!old) - return; + return NULL; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) - obj_cgroup_uncharge_pages(old, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(old); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + + __refill_stock(memcg, nr_pages); + + css_put(&memcg->css); + } /* * The leftover is flushed to the centralized per-memcg value. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3251 @ static void drain_obj_stock(struct obj_s stock->cached_pgdat = NULL; } - obj_cgroup_put(old); stock->cached_objcg = NULL; + /* + * The `old' object needs to be released by the caller via + * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
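+ * + * Caller pattern, for reference (a sketch matching the call sites in + * this file): + * + * local_lock_irqsave(&memcg_stock.stock_lock, flags); + * old = drain_obj_stock(stock); + * local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + * if (old) + * obj_cgroup_put(old);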
+ */ + return old; } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3264 @ static bool obj_stock_flush_required(str { struct mem_cgroup *memcg; - if (in_task() && stock->task_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; - } - if (stock->irq_obj.cached_objcg) { - memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); + if (stock->cached_objcg) { + memcg = obj_cgroup_memcg(stock->cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3276 @ static bool obj_stock_flush_required(str static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { + struct memcg_stock_pcp *stock; + struct obj_cgroup *old = NULL; unsigned long flags; - struct obj_stock *stock = get_obj_stock(&flags); unsigned int nr_pages = 0; + local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ - drain_obj_stock(stock); + old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->cached_objcg = objcg; stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3299 @ static void refill_obj_stock(struct obj_ stock->nr_bytes &= (PAGE_SIZE - 1); } - put_obj_stock(flags); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + if (old) + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3868 @ static ssize_t mem_cgroup_write(struct k } break; case RES_SOFT_LIMIT: - memcg->soft_limit = nr_pages; - ret = 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + ret = -EOPNOTSUPP; + } else { + memcg->soft_limit = nr_pages; + ret = 0; + } break; } return ret ?: nbytes; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4854 @ static ssize_t memcg_write_event_control char *endp; int ret; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + buf = strstrip(buf); efd = simple_strtoul(buf, &endp, 10); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6948 @ static void uncharge_page(struct page *p unsigned long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - bool use_objcg = PageMemcgKmem(page); VM_BUG_ON_PAGE(PageLRU(page), page); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6956 @ static void uncharge_page(struct page *p * page memcg or objcg at this point, we have fully * exclusive access to the page. */ - if (use_objcg) { + if (PageMemcgKmem(page)) { objcg = __page_objcg(page); /* * This get matches the put at the end of the function and @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6984 @ static void uncharge_page(struct page *p nr_pages = compound_nr(page); - if (use_objcg) { + if (PageMemcgKmem(page)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:7314 @ void mem_cgroup_swapout(struct page *pag * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. 
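+ * + * (On PREEMPT_RT, memcg_stats_lock() provides that synchronisation by + * disabling preemption instead; interrupts may stay enabled there.)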
*/ - VM_BUG_ON(!irqs_disabled()); + memcg_stats_lock(); mem_cgroup_charge_statistics(memcg, page, -nr_entries); + memcg_stats_unlock(); memcg_check_events(memcg, page); css_put(&memcg->css); Index: linux-5.15/mm/memory.c =================================================================== --- linux-5.15.orig/mm/memory.c +++ linux-5.15/mm/memory.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:5295 @ void __might_fault(const char *file, int return; if (pagefault_disabled()) return; - __might_sleep(file, line, 0); + __might_sleep(file, line); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); Index: linux-5.15/mm/page_alloc.c =================================================================== --- linux-5.15.orig/mm/page_alloc.c +++ linux-5.15/mm/page_alloc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3152 @ static void drain_local_pages_wq(struct * cpu which is alright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* Index: linux-5.15/mm/vmalloc.c =================================================================== --- linux-5.15.orig/mm/vmalloc.c +++ linux-5.15/mm/vmalloc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1921 @ static void *new_vmap_block(unsigned int return ERR_PTR(err); } - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_block_queue); + put_cpu_light(); return vaddr; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2005 @ static void *vb_alloc(unsigned long size order = get_order(size); rcu_read_lock(); - vbq = &get_cpu_var(vmap_block_queue); + get_cpu_light(); + vbq = this_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2029 @ static void *vb_alloc(unsigned long size break; } - put_cpu_var(vmap_block_queue); + put_cpu_light(); rcu_read_unlock(); /* Allocate new block if nothing was found */ Index: linux-5.15/mm/workingset.c =================================================================== --- linux-5.15.orig/mm/workingset.c +++ linux-5.15/mm/workingset.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:436 @ static struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { + struct address_space *mapping; + /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:446 @ void workingset_update_node(struct xa_no * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. 
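+ * + * (The irqs_disabled() assertion is dropped below: on PREEMPT_RT the + * i_pages xa_lock does not disable interrupts, so lockdep_assert_held() + * checks the lock that actually provides the protection.)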
*/ - VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + mapping = container_of(node->array, struct address_space, i_pages); + lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { Index: linux-5.15/mm/zsmalloc.c =================================================================== --- linux-5.15.orig/mm/zsmalloc.c +++ linux-5.15/mm/zsmalloc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:60 @ #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> +#include <linux/local_lock.h> #define ZSPAGE_MAGIC 0x58 @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:81 @ #define ZS_HANDLE_SIZE (sizeof(unsigned long)) +#ifdef CONFIG_PREEMPT_RT + +struct zsmalloc_handle { + unsigned long addr; + spinlock_t lock; +}; + +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) + +#else + +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) +#endif + /* * Object location (<PFN>, <obj_idx>) is encoded as * a single (unsigned long) handle value. @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:311 @ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:341 @ static void SetZsPageMovable(struct zs_p static int create_cache(struct zs_pool *pool) { - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, 0, 0, NULL); if (!pool->handle_cachep) return 1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:365 @ static void destroy_cache(struct zs_pool static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { - return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + void *p; + + p = kmem_cache_alloc(pool->handle_cachep, + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +#ifdef CONFIG_PREEMPT_RT + if (p) { + struct zsmalloc_handle *zh = p; + + spin_lock_init(&zh->lock); + } +#endif + return (unsigned long)p; } +#ifdef CONFIG_PREEMPT_RT +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) +{ + return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); +} +#endif + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:404 @ static void cache_free_zspage(struct zs_ static void record_obj(unsigned long handle, unsigned long obj) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + WRITE_ONCE(zh->addr, obj); +#else /* * lsb of @obj represents handle lock while other bits * represent object value the handle is pointing so * updating shouldn't do store tearing. 
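 *
 * Under PREEMPT_RT none of this lsb juggling happens: the handle
 * points at a struct zsmalloc_handle, the object value lives in
 * zh->addr, and pinning uses the real spinlock zh->lock instead of
 * HANDLE_PIN_BIT, roughly:
 *
 *	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
 *
 *	spin_lock(&zh->lock);		// pin_tag()
 *	WRITE_ONCE(zh->addr, obj);	// record_obj()
 *	spin_unlock(&zh->lock);		// unpin_tag()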
*/ WRITE_ONCE(*(unsigned long *)handle, obj); +#endif } /* zpool driver */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:497 @ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static bool is_zspage_isolated(struct zspage *zspage) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:906 @ static unsigned long location_to_obj(str static unsigned long handle_to_obj(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return zh->addr; +#else return *(unsigned long *)handle; +#endif } static unsigned long obj_to_head(struct page *page, void *obj) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:926 @ static unsigned long obj_to_head(struct static inline int testpin_tag(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_is_locked(&zh->lock); +#else return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static inline int trypin_tag(unsigned long handle) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_trylock(&zh->lock); +#else return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void pin_tag(unsigned long handle) __acquires(bitlock) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_lock(&zh->lock); +#else bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void unpin_tag(unsigned long handle) __releases(bitlock) { +#ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + + return spin_unlock(&zh->lock); +#else bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); +#endif } static void reset_page(struct page *page) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1348 @ void *zs_map_object(struct zs_pool *pool class = pool->size_class[class_idx]; off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1403 @ void zs_unmap_object(struct zs_pool *poo __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); unpin_tag(handle); Index: linux-5.15/net/Kconfig =================================================================== --- linux-5.15.orig/net/Kconfig +++ linux-5.15/net/Kconfig @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:297 @ config CGROUP_NET_CLASSID config NET_RX_BUSY_POLL bool - default y + default y if !PREEMPT_RT config BQL bool Index: linux-5.15/net/core/dev.c =================================================================== --- linux-5.15.orig/net/core/dev.c +++ linux-5.15/net/core/dev.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:228 @ static inline struct hlist_head *dev_ind static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); + raw_spin_lock(&sd->input_pkt_queue.raw_lock); #endif } static inline void rps_unlock(struct 
softnet_data *sd) { #ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); #endif } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3067 @ static void __netif_reschedule(struct Qd sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } void __netif_schedule(struct Qdisc *q) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3130 @ void __dev_kfree_skb_irq(struct sk_buff __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__dev_kfree_skb_irq); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:3862 @ no_lock_out: * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ +#ifdef CONFIG_PREEMPT_RT + contended = true; +#else contended = qdisc_is_running(q); +#endif if (unlikely(contended)) spin_lock(&q->busylock); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4692 @ drop: rps_unlock(sd); local_irq_restore(flags); + preempt_check_resched_rt(); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4933 @ static int netif_rx_internal(struct sk_b struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); + migrate_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4943 @ static int netif_rx_internal(struct sk_b ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); + migrate_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); + put_cpu_light(); } return ret; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:4989 @ int netif_rx_ni(struct sk_buff *skb) trace_netif_rx_ni_entry(skb); - preempt_disable(); + local_bh_disable(); err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); + local_bh_enable(); trace_netif_rx_ni_exit(err); return err; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6435 @ static void net_rps_action_and_irq_enabl sd->rps_ipi_list = NULL; local_irq_enable(); + preempt_check_resched_rt(); /* Send pending IPI's to kick RPS processing on remote cpus. 
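 *
 * The preempt_check_resched_rt() calls added after the IRQ-enable
 * sites in this file are assumed (the helper is defined outside this
 * hunk) to be a no-op on !PREEMPT_RT and to fold in a preemption
 * check on PREEMPT_RT, along the lines of:
 *
 *	#ifdef CONFIG_PREEMPT_RT
 *	# define preempt_check_resched_rt()	preempt_check_resched()
 *	#else
 *	# define preempt_check_resched_rt()	barrier()
 *	#endif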
*/ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); + preempt_check_resched_rt(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:6520 @ void __napi_schedule(struct napi_struct local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(__napi_schedule); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11343 @ static int dev_cpu_dead(unsigned int old raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11357 @ static int dev_cpu_dead(unsigned int old netif_rx_ni(skb); input_queue_head_incr(oldsd); } - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx_ni(skb); input_queue_head_incr(oldsd); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:11672 @ static int __init net_dev_init(void) INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init_raw(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); Index: linux-5.15/net/core/gen_estimator.c =================================================================== --- linux-5.15.orig/net/core/gen_estimator.c +++ linux-5.15/net/core/gen_estimator.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:43 @ */ struct net_rate_estimator { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; - seqcount_t *running; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; u8 ewma_log; u8 intvl_log; /* period : (250ms << intvl_log) */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:63 @ struct net_rate_estimator { }; static void est_fetch_counters(struct net_rate_estimator *e, - struct gnet_stats_basic_packed *b) + struct gnet_stats_basic_sync *b) { - memset(b, 0, sizeof(*b)); + gnet_stats_basic_sync_init(b); if (e->stats_lock) spin_lock(e->stats_lock); - __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); if (e->stats_lock) spin_unlock(e->stats_lock); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:79 @ static void est_fetch_counters(struct ne static void est_timer(struct timer_list *t) { struct net_rate_estimator *est = from_timer(est, t, timer); - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); + + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:98 @ static void est_timer(struct timer_list est->avpps += rate; write_seqcount_end(&est->seq); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = 
b_bytes; + est->last_packets = b_packets; est->next_jiffies += ((HZ/4) << est->intvl_log); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:116 @ static void est_timer(struct timer_list * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:130 @ static void est_timer(struct timer_list * Returns 0 on success or a negative error code. * */ -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, + bool running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); struct net_rate_estimator *old, *est; - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; int intvl_log; if (nla_len(opt) < sizeof(*parm)) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:173 @ int gen_new_estimator(struct gnet_stats_ est_fetch_counters(est, &b); if (lock) local_bh_enable(); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); if (lock) spin_lock_bh(lock); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:223 @ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount (might be NULL) + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:233 @ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code.
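 *
 * Example, mirroring the qdisc call sites converted later in this
 * series, where a plain boolean now takes the place of the old
 * seqcount pointer:
 *
 *	err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
 *				    NULL, true, tca[TCA_RATE]);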
*/ -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt) + bool running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, lock, running, opt); Index: linux-5.15/net/core/gen_stats.c =================================================================== --- linux-5.15.orig/net/core/gen_stats.c +++ linux-5.15/net/core/gen_stats.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:21 @ #include <linux/gen_stats.h> #include <net/netlink.h> #include <net/gen_stats.h> - +#include <net/sch_generic.h> static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:117 @ gnet_stats_start_copy(struct sk_buff *sk } EXPORT_SYMBOL(gnet_stats_start_copy); -static void -__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu) +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) { + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { - struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); - bytes = bcpu->bstats.bytes; - packets = bcpu->bstats.packets; + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - bstats->bytes += bytes; - bstats->packets += packets; + t_bytes += bytes; + t_packets += packets; } + _bstats_update(bstats, t_bytes, t_packets); } -void -__gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) { - unsigned int seq; + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); if (cpu) { - __gnet_stats_copy_basic_cpu(bstats, cpu); + gnet_stats_add_basic_cpu(bstats, cpu); return; } do { if (running) - seq = read_seqcount_begin(running); - bstats->bytes = b->bytes; - bstats->packets = b->packets; - } while (running && read_seqcount_retry(running, seq)); + start = u64_stats_fetch_begin_irq(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); +} +EXPORT_SYMBOL(gnet_stats_add_basic); + +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + + if (cpu) { + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + 
struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; + return; + } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); } -EXPORT_SYMBOL(__gnet_stats_copy_basic); static int -___gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b, - int type) +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) { - struct gnet_stats_basic_packed bstats = {0}; + u64 bstats_bytes, bstats_packets; - __gnet_stats_copy_basic(running, &bstats, cpu, b); + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); if (d->compat_tc_stats && type == TCA_STATS_BASIC) { - d->tc_stats.bytes = bstats.bytes; - d->tc_stats.packets = bstats.packets; + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; } if (d->tail) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:230 @ ___gnet_stats_copy_basic(const seqcount_ int res; memset(&sb, 0, sizeof(sb)); - sb.bytes = bstats.bytes; - sb.packets = bstats.packets; + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); - if (res < 0 || sb.packets == bstats.packets) + if (res < 0 || sb.packets == bstats_packets) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, - sizeof(bstats.packets), TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); } return 0; } /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:260 @ ___gnet_stats_copy_basic(const seqcount_ * if the room in the socket buffer was not sufficient. 
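 *
 * When @running is true, the snapshot underneath uses the u64_stats
 * retry loop shown in gnet_stats_read_basic() above, roughly:
 *
 *	do {
 *		start = u64_stats_fetch_begin_irq(&b->syncp);
 *		bytes = u64_stats_read(&b->bytes);
 *		packets = u64_stats_read(&b->packets);
 *	} while (running && u64_stats_fetch_retry_irq(&b->syncp, start));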
*/ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:287 @ EXPORT_SYMBOL(gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC_HW); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); } EXPORT_SYMBOL(gnet_stats_copy_basic_hw); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:340 @ gnet_stats_copy_rate_est(struct gnet_dum } EXPORT_SYMBOL(gnet_stats_copy_rate_est); -static void -__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *q) +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) { int i; for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen = 0; + qstats->qlen += qcpu->backlog; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:356 @ __gnet_stats_copy_queue_cpu(struct gnet_ } } -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) { if (cpu) { - __gnet_stats_copy_queue_cpu(qstats, cpu); + gnet_stats_add_queue_cpu(qstats, cpu); } else { - qstats->qlen = q->qlen; - qstats->backlog = q->backlog; - qstats->drops = q->drops; - qstats->requeues = q->requeues; - qstats->overlimits = q->overlimits; + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; } - - qstats->qlen = qlen; } -EXPORT_SYMBOL(__gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_add_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:393 @ gnet_stats_copy_queue(struct gnet_dump * { struct gnet_stats_queue qstats = {0}; - __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = 
qlen; if (d->compat_tc_stats) { d->tc_stats.drops = qstats.drops; Index: linux-5.15/net/netfilter/xt_RATEEST.c =================================================================== --- linux-5.15.orig/net/netfilter/xt_RATEEST.c +++ linux-5.15/net/netfilter/xt_RATEEST.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:97 @ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic_packed *stats = &info->est->bstats; + struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); - stats->bytes += skb->len; - stats->packets++; + u64_stats_add(&stats->bytes, skb->len); + u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:146 @ static int xt_rateest_tg_checkentry(cons if (!est) goto err1; + gnet_stats_basic_sync_init(&est->bstats); strlcpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; Index: linux-5.15/net/sched/act_api.c =================================================================== --- linux-5.15.orig/net/sched/act_api.c +++ linux-5.15/net/sched/act_api.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:489 @ int tcf_idr_create(struct tc_action_net atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { - p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; - p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } + gnet_stats_basic_sync_init(&p->tcfa_bstats); + gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:510 @ int tcf_idr_create(struct tc_action_net if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, - &p->tcfa_lock, NULL, est); + &p->tcfa_lock, false, est); if (err) goto err4; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1146 @ void tcf_action_update_stats(struct tc_a u64 drops, bool hw) { if (a->cpu_bstats) { - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); return; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1191 @ int tcf_action_copy_stats(struct sk_buff if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || - gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, - &p->tcfa_bstats_hw) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, + &p->tcfa_bstats, false) < 0 || + gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, Index: linux-5.15/net/sched/act_bpf.c =================================================================== --- linux-5.15.orig/net/sched/act_bpf.c +++ 
linux-5.15/net/sched/act_bpf.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:44 @ static int tcf_bpf_act(struct sk_buff *s int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); - bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { Index: linux-5.15/net/sched/act_ife.c =================================================================== --- linux-5.15.orig/net/sched/act_ife.c +++ linux-5.15/net/sched/act_ife.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:721 @ static int tcf_ife_decode(struct sk_buff u8 *tlv_data; u16 metalen; - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:809 @ static int tcf_ife_encode(struct sk_buff exceed_mtu = true; } - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ Index: linux-5.15/net/sched/act_mpls.c =================================================================== --- linux-5.15.orig/net/sched/act_mpls.c +++ linux-5.15/net/sched/act_mpls.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:62 @ static int tcf_mpls_act(struct sk_buff * int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. Index: linux-5.15/net/sched/act_police.c =================================================================== --- linux-5.15.orig/net/sched/act_police.c +++ linux-5.15/net/sched/act_police.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:128 @ static int tcf_police_init(struct net *n police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, - NULL, est); + false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:265 @ static int tcf_police_act(struct sk_buff int ret; tcf_lastuse_update(&police->tcf_tm); - bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); Index: linux-5.15/net/sched/act_sample.c =================================================================== --- linux-5.15.orig/net/sched/act_sample.c +++ linux-5.15/net/sched/act_sample.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:166 @ static int tcf_sample_act(struct sk_buff int retval; tcf_lastuse_update(&s->tcf_tm); - bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); Index: linux-5.15/net/sched/act_simple.c =================================================================== --- linux-5.15.orig/net/sched/act_simple.c +++ linux-5.15/net/sched/act_simple.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:39 @ static int tcf_simp_act(struct sk_buff * * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", - (char *)d->tcfd_defdata, d->tcf_bstats.packets); + 
(char *)d->tcfd_defdata, + u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } Index: linux-5.15/net/sched/act_skbedit.c =================================================================== --- linux-5.15.orig/net/sched/act_skbedit.c +++ linux-5.15/net/sched/act_skbedit.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:34 @ static int tcf_skbedit_act(struct sk_buf int action; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); Index: linux-5.15/net/sched/act_skbmod.c =================================================================== --- linux-5.15.orig/net/sched/act_skbmod.c +++ linux-5.15/net/sched/act_skbmod.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:34 @ static int tcf_skbmod_act(struct sk_buff u64 flags; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); action = READ_ONCE(d->tcf_action); if (unlikely(action == TC_ACT_SHOT)) Index: linux-5.15/net/sched/sch_api.c =================================================================== --- linux-5.15.orig/net/sched/sch_api.c +++ linux-5.15/net/sched/sch_api.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:887 @ static void qdisc_offload_graft_root(str static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:945 @ static int tc_fill_qdisc(struct sk_buff cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), - &d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1272 @ static struct Qdisc *qdisc_create(struct rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - seqcount_t *running; - err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } - if (sch->parent != TC_H_ROOT && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, - running, + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1358 @ static int qdisc_change(struct Qdisc *sc sch->cpu_bstats, &sch->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); } out: Index: linux-5.15/net/sched/sch_atm.c =================================================================== --- linux-5.15.orig/net/sched/sch_atm.c +++ linux-5.15/net/sched/sch_atm.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:55 @ struct atm_flow_data { struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ int ref; /* reference 
count */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:554 @ static int atm_tc_init(struct Qdisc *sch pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); INIT_LIST_HEAD(&p->link.list); + gnet_stats_basic_sync_init(&p->link.bstats); list_add(&p->link.list, &p->flows); p->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, extack); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:658 @ atm_tc_dump_class_stats(struct Qdisc *sc { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; Index: linux-5.15/net/sched/sch_cbq.c =================================================================== --- linux-5.15.orig/net/sched/sch_cbq.c +++ linux-5.15/net/sched/sch_cbq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:119 @ struct cbq_class { long avgidle; long deficit; /* Saved deficit for WRR */ psched_time_t penalized; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tc_cbq_xstats xstats; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:568 @ cbq_update(struct cbq_sched_data *q) long avgidle = cl->avgidle; long idle; - cl->bstats.packets++; - cl->bstats.bytes += len; + _bstats_update(&cl->bstats, len, 1); /* * (now - last) is total time between packet right edges. 
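/*
 * For reference, a sketch (assumed shape; the helper is not part of
 * this diff) of the _bstats_update() used in the cbq_update() hunk
 * above: the u64_stats writer section keeps 32-bit readers from
 * observing torn 64-bit counters:
 *
 *	u64_stats_update_begin(&bstats->syncp);
 *	u64_stats_add(&bstats->bytes, bytes);
 *	u64_stats_add(&bstats->packets, packets);
 *	u64_stats_update_end(&bstats->syncp);
 */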
@ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1385 @ cbq_dump_class_stats(struct Qdisc *sch, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1519 @ cbq_change_class(struct Qdisc *sch, u32 err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1611 @ cbq_change_class(struct Qdisc *sch, u32 if (cl == NULL) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1620 @ cbq_change_class(struct Qdisc *sch, u32 if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); tcf_block_put(cl->block); Index: linux-5.15/net/sched/sch_drr.c =================================================================== --- linux-5.15.orig/net/sched/sch_drr.c +++ linux-5.15/net/sched/sch_drr.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:22 @ struct drr_class { struct Qdisc_class_common common; unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:88 @ static int drr_change_class(struct Qdisc if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:108 @ static int drr_change_class(struct Qdisc if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:121 @ static int drr_change_class(struct Qdisc if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:268 @ static int drr_dump_class_stats(struct Q if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; Index: linux-5.15/net/sched/sch_ets.c =================================================================== --- linux-5.15.orig/net/sched/sch_ets.c +++ linux-5.15/net/sched/sch_ets.c @ 
linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:44 @ struct ets_class { struct Qdisc *qdisc; u32 quantum; u32 deficit; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; }; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:328 @ static int ets_class_dump_stats(struct Q struct ets_class *cl = ets_class_from_arg(sch, arg); struct Qdisc *cl_q = cl->qdisc; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:663 @ static int ets_qdisc_change(struct Qdisc q->nbands = nbands; for (i = nstrict; i < q->nstrict; i++) { - INIT_LIST_HEAD(&q->classes[i].alist); if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); q->classes[i].deficit = quanta[i]; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:690 @ static int ets_qdisc_change(struct Qdisc ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); - memset(&q->classes[i], 0, sizeof(q->classes[i])); + q->classes[i].qdisc = NULL; + q->classes[i].quantum = 0; + q->classes[i].deficit = 0; + gnet_stats_basic_sync_init(&q->classes[i].bstats); + memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); } return 0; } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:703 @ static int ets_qdisc_init(struct Qdisc * struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); - int err; + int err, i; if (!opt) return -EINVAL; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:713 @ static int ets_qdisc_init(struct Qdisc * return err; INIT_LIST_HEAD(&q->active); + for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) + INIT_LIST_HEAD(&q->classes[i].alist); + return ets_qdisc_change(sch, opt, extack); } Index: linux-5.15/net/sched/sch_generic.c =================================================================== --- linux-5.15.orig/net/sched/sch_generic.c +++ linux-5.15/net/sched/sch_generic.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:307 @ trace: /* * Transmit possibly several skbs, and handle the return status as - * required. Owning running seqcount bit guarantees that - * only one CPU can execute this function. + * required. Owning qdisc running bit guarantees that only one CPU + * can execute this function. 
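 *
 * (The running seqcount is removed by this series; the single-owner
 * guarantee is assumed to come instead from a qdisc state bit, e.g.
 * __QDISC_STATE2_RUNNING, claimed in qdisc_run_begin() with a
 * test_and_set_bit()-style operation while the qdisc lock is held.)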
* * Returns to the caller: * false - hardware queue frozen backoff @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:609 @ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:869 @ struct Qdisc_ops pfifo_fast_ops __read_m EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:893 @ struct Qdisc *qdisc_alloc(struct netdev_ __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); + gnet_stats_basic_sync_init(&sch->bstats); spin_lock_init(&sch->q.lock); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:918 @ struct Qdisc *qdisc_alloc(struct netdev_ lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); - sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; Index: linux-5.15/net/sched/sch_gred.c =================================================================== --- linux-5.15.orig/net/sched/sch_gred.c +++ linux-5.15/net/sched/sch_gred.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:59 @ struct gred_sched { u32 DPs; u32 def; struct red_vars wred_set; + struct tc_gred_qopt_offload *opt; }; static inline int gred_wred_mode(struct gred_sched *table) @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:315 @ static void gred_offload(struct Qdisc *s { struct gred_sched *table = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct tc_gred_qopt_offload opt = { - .command = command, - .handle = sch->handle, - .parent = sch->parent, - }; + struct tc_gred_qopt_offload *opt = table->opt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; + memset(opt, 0, sizeof(*opt)); + opt->command = command; + opt->handle = sch->handle; + opt->parent = sch->parent; + if (command == TC_GRED_REPLACE) { unsigned int i; - opt.set.grio_on = gred_rio_mode(table); - opt.set.wred_on = gred_wred_mode(table); - opt.set.dp_cnt = table->DPs; - opt.set.dp_def = table->def; + opt->set.grio_on = gred_rio_mode(table); + opt->set.wred_on = gred_wred_mode(table); + opt->set.dp_cnt = table->DPs; + opt->set.dp_def = table->def; for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; if (!q) continue; - opt.set.tab[i].present = true; - opt.set.tab[i].limit = q->limit; - opt.set.tab[i].prio = q->prio; - opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; - opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; - opt.set.tab[i].is_ecn = gred_use_ecn(q); - opt.set.tab[i].is_harddrop = gred_use_harddrop(q); - opt.set.tab[i].probability = q->parms.max_P; - opt.set.tab[i].backlog = &q->backlog; + opt->set.tab[i].present = true; + opt->set.tab[i].limit = q->limit; + opt->set.tab[i].prio = q->prio; + 
opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt->set.tab[i].is_ecn = gred_use_ecn(q); + opt->set.tab[i].is_harddrop = gred_use_harddrop(q); + opt->set.tab[i].probability = q->parms.max_P; + opt->set.tab[i].backlog = &q->backlog; } - opt.set.qstats = &sch->qstats; + opt->set.qstats = &sch->qstats; } - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); } static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; + u64 bytes = 0, packets = 0; unsigned int i; int ret; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:370 @ static int gred_offload_dump_stats(struc hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; - for (i = 0; i < MAX_DPs; i++) + for (i = 0; i < MAX_DPs; i++) { + gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; + } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:383 @ static int gred_offload_dump_stats(struc for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; - table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; - table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); + table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; - _bstats_update(&sch->bstats, - hw_stats->stats.bstats[i].bytes, - hw_stats->stats.bstats[i].packets); + bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } + _bstats_update(&sch->bstats, bytes, packets); kfree(hw_stats); return ret; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:736 @ err_unlock_free: static int gred_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct gred_sched *table = qdisc_priv(sch); struct nlattr *tb[TCA_GRED_MAX + 1]; int err; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:760 @ static int gred_init(struct Qdisc *sch, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); + if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { + table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); + if (!table->opt) + return -ENOMEM; + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:922 @ static void gred_destroy(struct Qdisc *s gred_destroy_vq(table->tab[i]); } gred_offload(sch, TC_GRED_DESTROY); + kfree(table->opt); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { Index: linux-5.15/net/sched/sch_hfsc.c =================================================================== --- linux-5.15.orig/net/sched/sch_hfsc.c +++ linux-5.15/net/sched/sch_hfsc.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:114 @ enum hfsc_class_flags { struct hfsc_class { struct 
Qdisc_class_common cl_common; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:968 @ hfsc_change_class(struct Qdisc *sch, u32 err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1036 @ hfsc_change_class(struct Qdisc *sch, u32 if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { tcf_block_put(cl->block); kfree(cl); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1329 @ hfsc_dump_class_stats(struct Qdisc *sch, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1407 @ hfsc_init_qdisc(struct Qdisc *sch, struc if (err) return err; + gnet_stats_basic_sync_init(&q->root.bstats); q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, Index: linux-5.15/net/sched/sch_htb.c =================================================================== --- linux-5.15.orig/net/sched/sch_htb.c +++ linux-5.15/net/sched/sch_htb.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:116 @ struct htb_class { /* * Written often fields */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_packed bstats_bias; + struct gnet_stats_basic_sync bstats; + struct gnet_stats_basic_sync bstats_bias; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1312 @ nla_put_failure: static void htb_offload_aggregate_stats(struct htb_sched *q, struct htb_class *cl) { + u64 bytes = 0, packets = 0; struct htb_class *c; unsigned int i; - memset(&cl->bstats, 0, sizeof(cl->bstats)); + gnet_stats_basic_sync_init(&cl->bstats); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1328 @ static void htb_offload_aggregate_stats( if (p != cl) continue; - cl->bstats.bytes += c->bstats_bias.bytes; - cl->bstats.packets += c->bstats_bias.packets; + bytes += u64_stats_read(&c->bstats_bias.bytes); + packets += u64_stats_read(&c->bstats_bias.packets); if (c->level == 0) { - cl->bstats.bytes += c->leaf.q->bstats.bytes; - cl->bstats.packets += c->leaf.q->bstats.packets; + bytes += u64_stats_read(&c->leaf.q->bstats.bytes); + packets += u64_stats_read(&c->leaf.q->bstats.packets); } } } + _bstats_update(&cl->bstats, bytes, packets); } static int @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1363 @ htb_dump_class_stats(struct Qdisc *sch, if (cl->leaf.q) cl->bstats = cl->leaf.q->bstats; else - memset(&cl->bstats, 0, sizeof(cl->bstats)); - cl->bstats.bytes += cl->bstats_bias.bytes; - cl->bstats.packets += cl->bstats_bias.packets; + gnet_stats_basic_sync_init(&cl->bstats); + _bstats_update(&cl->bstats, + 
u64_stats_read(&cl->bstats_bias.bytes), + u64_stats_read(&cl->bstats_bias.packets)); } else { htb_offload_aggregate_stats(q, cl); } } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1587 @ static int htb_destroy_class_offload(str } if (cl->parent) { - cl->parent->bstats_bias.bytes += q->bstats.bytes; - cl->parent->bstats_bias.packets += q->bstats.packets; + _bstats_update(&cl->parent->bstats_bias, + u64_stats_read(&q->bstats.bytes), + u64_stats_read(&q->bstats.packets)); } offload_opt = (struct tc_htb_qopt_offload) { @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1881 @ static int htb_change_class(struct Qdisc if (!cl) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); + gnet_stats_basic_sync_init(&cl->bstats_bias); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1893 @ static int htb_change_class(struct Qdisc err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE] ? : &est.nla); if (err) goto err_block_put; @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:1957 @ static int htb_change_class(struct Qdisc htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } - parent->bstats_bias.bytes += old_q->bstats.bytes; - parent->bstats_bias.packets += old_q->bstats.packets; + _bstats_update(&parent->bstats_bias, + u64_stats_read(&old_q->bstats.bytes), + u64_stats_read(&old_q->bstats.packets)); qdisc_put(old_q); } new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:2019 @ static int htb_change_class(struct Qdisc err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; Index: linux-5.15/net/sched/sch_mq.c =================================================================== --- linux-5.15.orig/net/sched/sch_mq.c +++ linux-5.15/net/sched/sch_mq.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:156 @ static int mq_dump(struct Qdisc *sch, st struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; - __u32 qlen = 0; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
However, statistics accounting needs @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:170 @ static int mq_dump(struct Qdisc *sch, st qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:257 @ static int mq_dump_class_stats(struct Qd struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, - &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; Index: linux-5.15/net/sched/sch_mqprio.c =================================================================== --- linux-5.15.orig/net/sched/sch_mqprio.c +++ linux-5.15/net/sched/sch_mqprio.c @ linux-5.15/Documentation/admin-guide/cgroup-v1/memory.rst:448 @ static int mqprio_dump(struct Qdisc *sch unsigned int ntx, tc; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. 
Index: linux-5.15/net/sched/sch_mqprio.c
===================================================================
--- linux-5.15.orig/net/sched/sch_mqprio.c
+++ linux-5.15/net/sched/sch_mqprio.c
@ linux-5.15/net/sched/sch_mqprio.c:448 @ static int mqprio_dump(struct Qdisc *sch
        unsigned int ntx, tc;
 
        sch->q.qlen = 0;
-       memset(&sch->bstats, 0, sizeof(sch->bstats));
+       gnet_stats_basic_sync_init(&sch->bstats);
        memset(&sch->qstats, 0, sizeof(sch->qstats));
 
        /* MQ supports lockless qdiscs. However, statistics accounting needs
@ linux-5.15/net/sched/sch_mqprio.c:460 @ static int mqprio_dump(struct Qdisc *sch
                qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
                spin_lock_bh(qdisc_lock(qdisc));
 
-               if (qdisc_is_percpu_stats(qdisc)) {
-                       __u32 qlen = qdisc_qlen_sum(qdisc);
-
-                       __gnet_stats_copy_basic(NULL, &sch->bstats,
-                                               qdisc->cpu_bstats,
-                                               &qdisc->bstats);
-                       __gnet_stats_copy_queue(&sch->qstats,
-                                               qdisc->cpu_qstats,
-                                               &qdisc->qstats, qlen);
-                       sch->q.qlen += qlen;
-               } else {
-                       sch->q.qlen += qdisc->q.qlen;
-                       sch->bstats.bytes += qdisc->bstats.bytes;
-                       sch->bstats.packets += qdisc->bstats.packets;
-                       sch->qstats.backlog += qdisc->qstats.backlog;
-                       sch->qstats.drops += qdisc->qstats.drops;
-                       sch->qstats.requeues += qdisc->qstats.requeues;
-                       sch->qstats.overlimits += qdisc->qstats.overlimits;
-               }
+               gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats,
+                                    &qdisc->bstats, false);
+               gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats,
+                                    &qdisc->qstats);
+               sch->q.qlen += qdisc_qlen(qdisc);
 
                spin_unlock_bh(qdisc_lock(qdisc));
        }
@ linux-5.15/net/sched/sch_mqprio.c:560 @ static int mqprio_dump_class_stats(struc
 {
        if (cl >= TC_H_MIN_PRIORITY) {
                int i;
-               __u32 qlen = 0;
+               __u32 qlen;
                struct gnet_stats_queue qstats = {0};
-               struct gnet_stats_basic_packed bstats = {0};
+               struct gnet_stats_basic_sync bstats;
                struct net_device *dev = qdisc_dev(sch);
                struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];
 
+               gnet_stats_basic_sync_init(&bstats);
                /* Drop lock here it will be reclaimed before touching
                 * statistics this is required because the d->lock we
                 * hold here is the look on dev_queue->qdisc_sleeping
@ linux-5.15/net/sched/sch_mqprio.c:581 @ static int mqprio_dump_class_stats(struc
 
                        spin_lock_bh(qdisc_lock(qdisc));
 
-                       if (qdisc_is_percpu_stats(qdisc)) {
-                               qlen = qdisc_qlen_sum(qdisc);
+                       gnet_stats_add_basic(&bstats, qdisc->cpu_bstats,
+                                            &qdisc->bstats, false);
+                       gnet_stats_add_queue(&qstats, qdisc->cpu_qstats,
+                                            &qdisc->qstats);
+                       sch->q.qlen += qdisc_qlen(qdisc);
 
-                               __gnet_stats_copy_basic(NULL, &bstats,
-                                                       qdisc->cpu_bstats,
-                                                       &qdisc->bstats);
-                               __gnet_stats_copy_queue(&qstats,
-                                                       qdisc->cpu_qstats,
-                                                       &qdisc->qstats,
-                                                       qlen);
-                       } else {
-                               qlen            += qdisc->q.qlen;
-                               bstats.bytes    += qdisc->bstats.bytes;
-                               bstats.packets  += qdisc->bstats.packets;
-                               qstats.backlog  += qdisc->qstats.backlog;
-                               qstats.drops    += qdisc->qstats.drops;
-                               qstats.requeues += qdisc->qstats.requeues;
-                               qstats.overlimits += qdisc->qstats.overlimits;
-                       }
                        spin_unlock_bh(qdisc_lock(qdisc));
                }
+               qlen = qdisc_qlen(sch) + qstats.qlen;
 
                /* Reclaim root sleeping lock before completing stats */
                if (d->lock)
                        spin_lock_bh(d->lock);
-               if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
+               if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 ||
                    gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
                        return -1;
        } else {
                struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
 
                sch = dev_queue->qdisc_sleeping;
-               if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
-                                         sch->cpu_bstats, &sch->bstats) < 0 ||
+               if (gnet_stats_copy_basic(d, sch->cpu_bstats,
+                                         &sch->bstats, true) < 0 ||
                    qdisc_qstats_copy(d, sch) < 0)
                        return -1;
        }
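Same conversion in sch_mqprio.c. Note the new boolean tail argument of
gnet_stats_copy_basic(): it says whether the counters being copied are still
live. Both forms occur in the hunk above; the call shapes below are taken
straight from the diff, only the comments are interpretation:

  /* Live, concurrently updated per-qdisc counters: request a
   * u64_stats-consistent snapshot (running == true). */
  if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0)
          return -1;

  /* On-stack aggregate filled under qdisc_lock(), with no concurrent
   * writer: no retry loop is needed (running == false). */
  if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0)
          return -1;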
Index: linux-5.15/net/sched/sch_multiq.c
===================================================================
--- linux-5.15.orig/net/sched/sch_multiq.c
+++ linux-5.15/net/sched/sch_multiq.c
@ linux-5.15/net/sched/sch_multiq.c:340 @ static int multiq_dump_class_stats(struc
        struct Qdisc *cl_q;
 
        cl_q = q->queues[cl - 1];
-       if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-                                 d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 ||
            qdisc_qstats_copy(d, cl_q) < 0)
                return -1;
Index: linux-5.15/net/sched/sch_prio.c
===================================================================
--- linux-5.15.orig/net/sched/sch_prio.c
+++ linux-5.15/net/sched/sch_prio.c
@ linux-5.15/net/sched/sch_prio.c:362 @ static int prio_dump_class_stats(struct
        struct Qdisc *cl_q;
 
        cl_q = q->queues[cl - 1];
-       if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-                                 d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, cl_q->cpu_bstats,
+                                 &cl_q->bstats, true) < 0 ||
            qdisc_qstats_copy(d, cl_q) < 0)
                return -1;
Index: linux-5.15/net/sched/sch_qfq.c
===================================================================
--- linux-5.15.orig/net/sched/sch_qfq.c
+++ linux-5.15/net/sched/sch_qfq.c
@ linux-5.15/net/sched/sch_qfq.c:134 @ struct qfq_class {
        unsigned int filter_cnt;
 
-       struct gnet_stats_basic_packed bstats;
+       struct gnet_stats_basic_sync bstats;
        struct gnet_stats_queue qstats;
        struct net_rate_estimator __rcu *rate_est;
        struct Qdisc *qdisc;
@ linux-5.15/net/sched/sch_qfq.c:455 @ static int qfq_change_class(struct Qdisc
                        err = gen_replace_estimator(&cl->bstats, NULL,
                                                    &cl->rate_est,
                                                    NULL,
-                                                   qdisc_root_sleeping_running(sch),
+                                                   true,
                                                    tca[TCA_RATE]);
                        if (err)
                                return err;
@ linux-5.15/net/sched/sch_qfq.c:469 @ static int qfq_change_class(struct Qdisc
        if (cl == NULL)
                return -ENOBUFS;
 
+       gnet_stats_basic_sync_init(&cl->bstats);
        cl->common.classid = classid;
        cl->deficit = lmax;
@ linux-5.15/net/sched/sch_qfq.c:482 @ static int qfq_change_class(struct Qdisc
                err = gen_new_estimator(&cl->bstats, NULL,
                                        &cl->rate_est,
                                        NULL,
-                                       qdisc_root_sleeping_running(sch),
+                                       true,
                                        tca[TCA_RATE]);
                if (err)
                        goto destroy_class;
@ linux-5.15/net/sched/sch_qfq.c:644 @ static int qfq_dump_class_stats(struct Q
        xstats.weight = cl->agg->class_weight;
        xstats.lmax = cl->agg->lmax;
 
-       if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-                                 d, NULL, &cl->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
            qdisc_qstats_copy(d, cl->qdisc) < 0)
                return -1;
@ linux-5.15/net/sched/sch_qfq.c:1238 @ static int qfq_enqueue(struct sk_buff *s
                return err;
        }
 
-       cl->bstats.bytes += len;
-       cl->bstats.packets += gso_segs;
+       _bstats_update(&cl->bstats, len, gso_segs);
        sch->qstats.backlog += len;
        ++sch->q.qlen;
Index: linux-5.15/net/sched/sch_taprio.c
===================================================================
--- linux-5.15.orig/net/sched/sch_taprio.c
+++ linux-5.15/net/sched/sch_taprio.c
@ linux-5.15/net/sched/sch_taprio.c:2025 @ static int taprio_dump_class_stats(struc
        struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
 
        sch = dev_queue->qdisc_sleeping;
-       if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+       if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 ||
            qdisc_qstats_copy(d, sch) < 0)
                return -1;
        return 0;
Index: linux-5.15/net/sunrpc/svc_xprt.c
===================================================================
--- linux-5.15.orig/net/sunrpc/svc_xprt.c
+++ linux-5.15/net/sunrpc/svc_xprt.c
@ linux-5.15/net/sunrpc/svc_xprt.c:444 @ void svc_xprt_do_enqueue(struct svc_xprt
        if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
                return;
 
-       cpu = get_cpu();
+       cpu = get_cpu_light();
        pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
 
        atomic_long_inc(&pool->sp_stats.packets);
@ linux-5.15/net/sunrpc/svc_xprt.c:468 @ void svc_xprt_do_enqueue(struct svc_xprt
        rqstp = NULL;
 out_unlock:
        rcu_read_unlock();
-       put_cpu();
+       put_cpu_light();
        trace_svc_xprt_do_enqueue(xprt, rqstp);
 }
 EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
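get_cpu() disables preemption around the pool lookup, turning everything up to
put_cpu() into a non-preemptible region, which is exactly what PREEMPT_RT tries
to avoid on this path. The _light variants only pin the task to its current
CPU. A sketch of how the RT series typically defines them elsewhere in the
tree (an assumption here; the definitions are not part of this hunk):

  #ifdef CONFIG_PREEMPT_RT
  /* Stable CPU number without disabling preemption: the task may be
   * preempted, but it cannot migrate to another CPU. */
  # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
  # define put_cpu_light()        migrate_enable()
  #else
  # define get_cpu_light()        get_cpu()
  # define put_cpu_light()        put_cpu()
  #endif

svc_pool_for_cpu() only needs a stable CPU id, not a non-preemptible section,
so migrate_disable() is sufficient.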
Index: linux-5.15/samples/kfifo/bytestream-example.c
===================================================================
--- linux-5.15.orig/samples/kfifo/bytestream-example.c
+++ linux-5.15/samples/kfifo/bytestream-example.c
@ linux-5.15/samples/kfifo/bytestream-example.c:25 @
 #define PROC_FIFO      "bytestream-fifo"
 
 /* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
 
 /* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
 
 /*
  * define DYNAMIC in this example for a dynamically allocated fifo.
@ linux-5.15/samples/kfifo/bytestream-example.c:119 @ static ssize_t fifo_write(struct file *f
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&write_lock))
+       if (mutex_lock_interruptible(&write_access))
                return -ERESTARTSYS;
 
        ret = kfifo_from_user(&test, buf, count, &copied);
 
-       mutex_unlock(&write_lock);
+       mutex_unlock(&write_access);
        if (ret)
                return ret;
@ linux-5.15/samples/kfifo/bytestream-example.c:137 @ static ssize_t fifo_read(struct file *fi
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&read_lock))
+       if (mutex_lock_interruptible(&read_access))
                return -ERESTARTSYS;
 
        ret = kfifo_to_user(&test, buf, count, &copied);
 
-       mutex_unlock(&read_lock);
+       mutex_unlock(&read_access);
        if (ret)
                return ret;
Index: linux-5.15/samples/kfifo/inttype-example.c
===================================================================
--- linux-5.15.orig/samples/kfifo/inttype-example.c
+++ linux-5.15/samples/kfifo/inttype-example.c
@ linux-5.15/samples/kfifo/inttype-example.c:25 @
 #define PROC_FIFO      "int-fifo"
 
 /* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
 
 /* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
 
 /*
  * define DYNAMIC in this example for a dynamically allocated fifo.
@ linux-5.15/samples/kfifo/inttype-example.c:112 @ static ssize_t fifo_write(struct file *f
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&write_lock))
+       if (mutex_lock_interruptible(&write_access))
                return -ERESTARTSYS;
 
        ret = kfifo_from_user(&test, buf, count, &copied);
 
-       mutex_unlock(&write_lock);
+       mutex_unlock(&write_access);
        if (ret)
                return ret;
@ linux-5.15/samples/kfifo/inttype-example.c:130 @ static ssize_t fifo_read(struct file *fi
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&read_lock))
+       if (mutex_lock_interruptible(&read_access))
                return -ERESTARTSYS;
 
        ret = kfifo_to_user(&test, buf, count, &copied);
 
-       mutex_unlock(&read_lock);
+       mutex_unlock(&read_access);
        if (ret)
                return ret;
Index: linux-5.15/samples/kfifo/record-example.c
===================================================================
--- linux-5.15.orig/samples/kfifo/record-example.c
+++ linux-5.15/samples/kfifo/record-example.c
@ linux-5.15/samples/kfifo/record-example.c:25 @
 #define PROC_FIFO      "record-fifo"
 
 /* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
 
 /* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
 
 /*
  * define DYNAMIC in this example for a dynamically allocated fifo.
@ linux-5.15/samples/kfifo/record-example.c:126 @ static ssize_t fifo_write(struct file *f
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&write_lock))
+       if (mutex_lock_interruptible(&write_access))
                return -ERESTARTSYS;
 
        ret = kfifo_from_user(&test, buf, count, &copied);
 
-       mutex_unlock(&write_lock);
+       mutex_unlock(&write_access);
        if (ret)
                return ret;
@ linux-5.15/samples/kfifo/record-example.c:144 @ static ssize_t fifo_read(struct file *fi
        int ret;
        unsigned int copied;
 
-       if (mutex_lock_interruptible(&read_lock))
+       if (mutex_lock_interruptible(&read_access))
                return -ERESTARTSYS;
 
        ret = kfifo_to_user(&test, buf, count, &copied);
 
-       mutex_unlock(&read_lock);
+       mutex_unlock(&read_access);
        if (ret)
                return ret;
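The three kfifo samples rename their mutexes because read_lock/write_lock are
also the names of the kernel's rwlock primitives; a static variable with the
same name shadows the locking API within the file and can confuse readers and
lock-annotation tooling (on RT, where rwlocks become sleeping locks, the
collision is extra misleading). Illustration only, not code from the tree:

  static DEFINE_MUTEX(read_lock);   /* old name: shadows read_lock() from
                                     * <linux/rwlock.h> inside this file */
  static DEFINE_MUTEX(read_access); /* new name: unambiguously a mutex */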
Index: linux-5.15/security/smack/smack_lsm.c
===================================================================
--- linux-5.15.orig/security/smack/smack_lsm.c
+++ linux-5.15/security/smack/smack_lsm.c
@ linux-5.15/security/smack/smack_lsm.c:54 @
 #define SMK_RECEIVING  1
 #define SMK_SENDING    2
 
+#ifdef SMACK_IPV6_PORT_LABELING
 static DEFINE_MUTEX(smack_ipv6_lock);
 static LIST_HEAD(smk_ipv6_port_list);
+#endif
 struct kmem_cache *smack_rule_cache;
 int smack_enabled __initdata;
@ linux-5.15/security/smack/smack_lsm.c:2601 @ static void smk_ipv6_port_label(struct s
        mutex_unlock(&smack_ipv6_lock);
        return;
 }
-#endif
 
 /**
  * smk_ipv6_port_check - check Smack port access
@ linux-5.15/security/smack/smack_lsm.c:2663 @ static int smk_ipv6_port_check(struct so
        return smk_ipv6_check(skp, object, address, act);
 }
+#endif
 
 /**
  * smack_inode_setsecurity - set smack xattrs
@ linux-5.15/security/smack/smack_lsm.c:2850 @ static int smack_socket_connect(struct s
                        rc = smk_ipv6_check(ssp->smk_out, rsp, sip,
                                            SMK_CONNECTING);
        }
-       if (__is_defined(SMACK_IPV6_PORT_LABELING))
-               rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
+#ifdef SMACK_IPV6_PORT_LABELING
+       rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
+#endif
 
        return rc;
 }
Index: linux-5.15/sound/soc/mediatek/common/mtk-afe-fe-dai.c
===================================================================
--- linux-5.15.orig/sound/soc/mediatek/common/mtk-afe-fe-dai.c
+++ linux-5.15/sound/soc/mediatek/common/mtk-afe-fe-dai.c
@ linux-5.15/sound/soc/mediatek/common/mtk-afe-fe-dai.c:291 @ const struct snd_soc_dai_ops mtk_afe_fe_
 };
 EXPORT_SYMBOL_GPL(mtk_afe_fe_ops);
 
-static DEFINE_MUTEX(irqs_lock);
 int mtk_dynamic_irq_acquire(struct mtk_base_afe *afe)
 {
        int i;
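On the smack_lsm.c hunks above: `__is_defined()` evaluates at compile time but
leaves the call in the source, so smk_ipv6_port_check() still had to compile
even with port labeling configured out. Since this change moves that function
and the port list under #ifdef SMACK_IPV6_PORT_LABELING, the call site must
become preprocessor-conditional too. A sketch of the failure mode the #ifdef
form avoids (illustration, not code from the tree):

  /* With smk_ipv6_port_check() now compiled out, this no longer builds
   * even though the branch is statically false: */
  if (__is_defined(SMACK_IPV6_PORT_LABELING))
          rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);

  /* The preprocessor form removes the reference entirely: */
  #ifdef SMACK_IPV6_PORT_LABELING
          rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
  #endif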