distribution/packages/kernel/linux/patches/AMD64/001-patch-6.3.3-rt15.patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e24a9820e12fa..ba2cf1cec3d9c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@ config ARM
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
@@ -72,7 +73,7 @@ config ARM
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT
select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
@@ -117,6 +118,8 @@ config ARM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RSEQ
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 7f092cb55a417..ffcbf8ebed4bf 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -62,6 +62,7 @@ struct cpu_context_save {
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
struct cpu_context_save cpu_context; /* cpu context */
@@ -129,6 +130,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_UPROBE 3 /* breakpointed or singlestepping */
#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 5
#define TIF_USING_IWMMXT 17
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
@@ -148,6 +150,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT)
/* Checks for any syscall work in entry-common.S */
@@ -157,7 +160,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NOTIFY_SIGNAL)
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 38121c59cbc26..c6fafd53d5bea 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -43,6 +43,7 @@ int main(void)
BLANK();
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain));
DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context));
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index c39303e5c2347..cfb4660e9feab 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc)
#ifdef CONFIG_PREEMPTION
ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
- ldr r0, [tsk, #TI_FLAGS] @ get flags
teq r8, #0 @ if preempt count != 0
+ bne 1f @ return from exception
+ ldr r0, [tsk, #TI_FLAGS] @ get flags
+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
+ blne svc_preempt @ preempt!
+
+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r8, #0 @ if preempt lazy count != 0
movne r0, #0 @ force flags to 0
- tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
blne svc_preempt
+1:
#endif
svc_exit r5, irq = 1 @ return from exception
@@ -241,8 +248,14 @@ ENDPROC(__irq_svc)
1: bl preempt_schedule_irq @ irq en/disable is done inside
ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
tst r0, #_TIF_NEED_RESCHED
+ bne 1b
+ tst r0, #_TIF_NEED_RESCHED_LAZY
reteq r8 @ go again
- b 1b
+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r0, #0 @ if preempt lazy count != 0
+ beq 1b
+ ret r8 @ go again
+
#endif
__und_fault:
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index e07f359254c3c..b50a3248e79f3 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
*/
trace_hardirqs_off();
do {
- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY))) {
schedule();
} else {
if (unlikely(!user_mode(regs)))
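
Taken together, the ARM hunks above give the scheduler two signals: TIF_NEED_RESCHED still demands an immediate reschedule, while TIF_NEED_RESCHED_LAZY only requests one at the next natural preemption point, such as the return to user space handled in do_work_pending(). A minimal C sketch of the combined test (the helper name is hypothetical, not part of the patch):

/*
 * Hypothetical helper mirroring the do_work_pending() change above:
 * either flag means "call schedule() before returning to user space",
 * but only TIF_NEED_RESCHED also drives in-kernel preemption in
 * entry-armv.S.
 */
static inline bool arm_resched_pending(unsigned long thread_flags)
{
	return thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
}
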
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 2418f1efabd87..79ab2138ab0a8 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -436,6 +436,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
if (addr < TASK_SIZE)
return do_page_fault(addr, fsr, regs);
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
if (user_mode(regs))
goto bad_area;
@@ -506,6 +509,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
static int
do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
do_bad_area(addr, fsr, regs);
return 0;
}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1023e896d46b8..29fcf54cf68ad 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -95,6 +95,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ select ARCH_SUPPORTS_RT
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@@ -207,6 +208,7 @@ config ARM64
select HAVE_PERF_USER_STACK_DUMP
select HAVE_PREEMPT_DYNAMIC_KEY
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_PREEMPT_LAZY
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_FUNCTION_ARG_ACCESS_API
select MMU_GATHER_RCU_TABLE_FREE
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 0159b625cc7f0..a5486918e5eeb 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void)
* interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE
* pair.
*/
- return !pc || !READ_ONCE(ti->preempt_count);
+ if (!pc || !READ_ONCE(ti->preempt_count))
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if ((pc & ~PREEMPT_NEED_RESCHED))
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
}
static inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u64 pc = READ_ONCE(current_thread_info()->preempt_count);
+ if (pc == preempt_offset)
+ return true;
+
+ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
+ return false;
+
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
u64 pc = READ_ONCE(current_thread_info()->preempt_count);
return pc == preempt_offset;
+#endif
}
#ifdef CONFIG_PREEMPTION
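
The lazy decision logic added to __preempt_count_dec_and_test() and should_resched() above (the x86 hunks later in this patch follow the same structure) reduces to three ordered checks. A hedged restatement in plain C, illustrative only:

/*
 * Sketch of the CONFIG_PREEMPT_LAZY should_resched() logic above,
 * spelled out step by step:
 */
static inline bool should_resched_lazy_sketch(u64 pc, int preempt_offset)
{
	/* 1. Count at the allowed offset with NEED_RESCHED folded in:
	 *    preempt immediately.
	 */
	if (pc == preempt_offset)
		return true;

	/* 2. Real nesting beyond the allowed offset: never preempt. */
	if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
		return false;

	/* 3. Only a lazy request remains, honored solely when no
	 *    lazy-preempt-disabled section is active.
	 */
	if (current_thread_info()->preempt_lazy_count)
		return false;

	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
}
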
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 848739c15de82..4b7148fd5551f 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -26,6 +26,7 @@ struct thread_info {
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
u64 ttbr0; /* saved TTBR0_EL1 */
#endif
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
union {
u64 preempt_count; /* 0 => preemptible, <0 => bug */
struct {
@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst,
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */
#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 7
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst,
#define _TIF_SVE (1 << TIF_SVE)
#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
_TIF_NOTIFY_SIGNAL)
@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst,
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
_TIF_SYSCALL_EMU)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
#ifdef CONFIG_SHADOW_CALL_STACK
#define INIT_SCS \
.scs_base = init_shadow_call_stack, \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index ae345b06e9f7e..da7e10ff11a81 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -33,6 +33,7 @@ int main(void)
DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu));
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
#endif
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 06a02707f4882..e7f5284d5daf1 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -1278,7 +1278,7 @@ static void do_signal(struct pt_regs *regs)
void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
{
do {
- if (thread_flags & _TIF_NEED_RESCHED) {
+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
/* Unmask Debug and SError for the next task */
local_daif_restore(DAIF_PROCCTX_NOIRQ);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a6c4407d3ec83..25f98b854d32f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
@@ -247,8 +248,10 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select HAVE_RSEQ
select HAVE_SETUP_PER_CPU_AREA if PPC64
select HAVE_SOFTIRQ_ON_OWN_STACK
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
index 283c346478565..4727f40052ddd 100644
--- a/arch/powerpc/include/asm/stackprotector.h
+++ b/arch/powerpc/include/asm/stackprotector.h
@@ -19,8 +19,13 @@
*/
static __always_inline void boot_init_stack_canary(void)
{
- unsigned long canary = get_random_canary();
+ unsigned long canary;
+#ifndef CONFIG_PREEMPT_RT
+ canary = get_random_canary();
+#else
+ canary = ((unsigned long)&canary) & CANARY_MASK;
+#endif
current->stack_canary = canary;
#ifdef CONFIG_PPC64
get_paca()->canary = canary;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index af58f1ed3952e..520864de8bb27 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -53,6 +53,8 @@
struct thread_info {
int preempt_count; /* 0 => preemptable,
<0 => BUG */
+ int preempt_lazy_count; /* 0 => preemptable,
+ <0 => BUG */
#ifdef CONFIG_SMP
unsigned int cpu;
#endif
@@ -77,6 +79,7 @@ struct thread_info {
#define INIT_THREAD_INFO(tsk) \
{ \
.preempt_count = INIT_PREEMPT_COUNT, \
+ .preempt_lazy_count = 0, \
.flags = 0, \
}
@@ -102,6 +105,7 @@ void arch_setup_new_exec(void);
#define TIF_PATCH_PENDING 6 /* pending live patching update */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SINGLESTEP 8 /* singlestepping active */
+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
#define TIF_SECCOMP 10 /* secure computing */
#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */
#define TIF_NOERROR 12 /* Force successful syscall return */
@@ -117,6 +121,7 @@ void arch_setup_new_exec(void);
#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */
#define TIF_32BIT 20 /* 32 bit binary */
+
/* as above, but as bit values */
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
@@ -128,6 +133,7 @@ void arch_setup_new_exec(void);
#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTOREALL (1<<TIF_RESTOREALL)
#define _TIF_NOERROR (1<<TIF_NOERROR)
@@ -141,10 +147,12 @@ void arch_setup_new_exec(void);
_TIF_SYSCALL_EMU)
#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+ _TIF_NEED_RESCHED_LAZY | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
/* Bits in local_flags */
/* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 0ec1581619db5..e333feb12c984 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -186,7 +186,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
ti_flags = read_thread_flags();
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
local_irq_enable();
- if (ti_flags & _TIF_NEED_RESCHED) {
+ if (ti_flags & _TIF_NEED_RESCHED_MASK) {
schedule();
} else {
/*
@@ -398,11 +398,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
/* Returning to a kernel context with local irqs enabled. */
WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
- if (IS_ENABLED(CONFIG_PREEMPT)) {
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
/* Return to preemptible kernel context */
if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) {
if (preempt_count() == 0)
preempt_schedule_irq();
+ } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) {
+ if ((preempt_count() == 0) &&
+ (current_thread_info()->preempt_lazy_count == 0))
+ preempt_schedule_irq();
}
}
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9bdd79aa51cfc..038f8355b29ca 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -261,12 +261,17 @@ static char *get_mmu_str(void)
static int __die(const char *str, struct pt_regs *regs, long err)
{
+ const char *pr = "";
+
printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
PAGE_SIZE / 1024, get_mmu_str(),
- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index a9f57dad6d916..a0b528d4bb7cd 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -225,6 +225,7 @@ config KVM_E500MC
config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && PPC_E500
+ depends on !PREEMPT_RT
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 9d229ef7f86ef..ada817c49b722 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -51,7 +51,7 @@ static int trace_imc_mem_size;
* core and trace-imc
*/
static struct imc_pmu_ref imc_global_refc = {
- .lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock),
+ .lock = __SPIN_LOCK_UNLOCKED(imc_global_refc.lock),
.id = 0,
.refc = 0,
};
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 21b22bf16ce66..b506377a16a74 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -2,6 +2,7 @@
config PPC_PSERIES
depends on PPC64 && PPC_BOOK3S
bool "IBM pSeries & new (POWER5-based) iSeries"
+ select GENERIC_ALLOCATOR
select HAVE_PCSPKR_PLATFORM
select MPIC
select OF_DYNAMIC
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c74b71d4733d4..64ba14baabd30 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -24,6 +24,7 @@
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
+#include <linux/local_lock.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
return ret;
}
-static DEFINE_PER_CPU(__be64 *, tce_page);
+struct tce_page {
+ __be64 *page;
+ local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct tce_page, tce_page) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long npages, unsigned long uaddr,
@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
direction, attrs);
}
- local_irq_save(flags); /* to protect tcep and the page behind it */
+ /* to protect tcep and the page behind it */
+ local_lock_irqsave(&tce_page.lock, flags);
- tcep = __this_cpu_read(tce_page);
+ tcep = __this_cpu_read(tce_page.page);
/* This is safe to do since interrupts are off when we're called
* from iommu_alloc{,_sg}()
@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
/* If allocation fails, fall back to the loop implementation */
if (!tcep) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
return tce_build_pSeriesLP(tbl->it_index, tcenum,
tceshift,
npages, uaddr, direction, attrs);
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
rpn = __pa(uaddr) >> tceshift;
@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcenum += limit;
} while (npages > 0 && !rc);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
@@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
DMA_BIDIRECTIONAL, 0);
}
- local_irq_disable(); /* to protect tcep and the page behind it */
- tcep = __this_cpu_read(tce_page);
+ /* to protect tcep and the page behind it */
+ local_lock_irq(&tce_page.lock);
+ tcep = __this_cpu_read(tce_page.page);
if (!tcep) {
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
if (!tcep) {
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return -ENOMEM;
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
/* error cleanup: caller will clear whole range */
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return rc;
}
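
This iommu conversion is the canonical local_lock migration: per-CPU data that was guarded by plain local_irq_save()/local_irq_disable() gains an explicit lock object, which still compiles down to an IRQ-off section on !RT but becomes a per-CPU sleeping lock on RT. A minimal sketch of the idiom, using a hypothetical per-CPU buffer rather than the tce_page above:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcpu_buf {
	void *page;
	local_lock_t lock;
};
static DEFINE_PER_CPU(struct pcpu_buf, pcpu_buf) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void pcpu_buf_use(void)
{
	unsigned long flags;

	/*
	 * On !RT this disables interrupts exactly like local_irq_save();
	 * on RT it takes a per-CPU rtmutex-based lock instead, keeping
	 * the section preemptible while still serializing access to the
	 * CPU-local data.
	 */
	local_lock_irqsave(&pcpu_buf.lock, flags);
	/* ... operate on this_cpu_ptr(&pcpu_buf)->page ... */
	local_unlock_irqrestore(&pcpu_buf.lock, flags);
}
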
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f495..dcbf3c08926b1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -114,6 +114,7 @@ config X86
select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
+ select ARCH_SUPPORTS_RT
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
@@ -252,6 +253,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8f..5b096893f6a21 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,18 +90,49 @@ static __always_inline void __preempt_count_sub(int val)
* a decrement which hits zero means we have no preempt_count and should
* reschedule.
*/
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
{
return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e,
__percpu_arg([var]));
}
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+ if (____preempt_count_dec_and_test())
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if (preempt_count())
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
+}
+
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u32 tmp;
+ tmp = raw_cpu_read_4(pcpu_hot.preempt_count);
+ if (tmp == preempt_offset)
+ return true;
+
+ /* preempt count == 0 ? */
+ tmp &= ~PREEMPT_NEED_RESCHED;
+ if (tmp != preempt_offset)
+ return false;
+ /* XXX PREEMPT_LOCK_OFFSET */
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset);
+#endif
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f1cccba52eb97..c8697ca0378f4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,6 +57,8 @@ struct thread_info {
unsigned long flags; /* low level flags */
unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
+ int preempt_lazy_count; /* 0 => lazy preemptable
+ <0 => BUG */
#ifdef CONFIG_SMP
u32 cpu; /* current CPU */
#endif
@@ -65,6 +67,7 @@ struct thread_info {
#define INIT_THREAD_INFO(tsk) \
{ \
.flags = 0, \
+ .preempt_lazy_count = 0, \
}
#else /* !__ASSEMBLY__ */
@@ -92,6 +95,7 @@ struct thread_info {
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -115,6 +119,7 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aa490da3cef23..d73b6d32bd827 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -57,6 +57,40 @@ static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
u32 index, int offset, struct bio *bio);
+#ifdef CONFIG_PREEMPT_RT
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
+{
+ size_t index;
+
+ for (index = 0; index < num_pages; index++)
+ spin_lock_init(&zram->table[index].lock);
+}
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+ int ret;
+
+ ret = spin_trylock(&zram->table[index].lock);
+ if (ret)
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+ return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+ spin_lock(&zram->table[index].lock);
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+ spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
static int zram_slot_trylock(struct zram *zram, u32 index)
{
@@ -72,6 +106,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
{
bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
}
+#endif
static inline bool init_done(struct zram *zram)
{
@@ -1311,6 +1346,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
if (!huge_class_size)
huge_class_size = zs_huge_class_size(zram->mem_pool);
+ zram_meta_init_table_locks(zram, num_pages);
return true;
}
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c5254626f051f..a6b24dfec95fb 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,6 +69,9 @@ struct zram_table_entry {
unsigned long element;
};
unsigned long flags;
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t lock;
+#endif
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
ktime_t ac_time;
#endif
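
bit_spin_lock() busy-waits with preemption disabled, which RT's sleeping spinlocks cannot tolerate, so each zram table entry grows a real spinlock_t and the ZRAM_LOCK flag bit is maintained by hand for code that only inspects flags. Condensed from the hunks above, the two lock shapes side by side (function names are illustrative):

/* !RT: the lock is a bit in ->flags; taking it disables preemption
 * and busy-waits on contention.
 */
static void zram_entry_lock_bitlock(struct zram_table_entry *e)
{
	bit_spin_lock(ZRAM_LOCK, &e->flags);
}

/* RT: a spinlock_t (an rtmutex underneath) serializes the entry,
 * with the ZRAM_LOCK bit mirrored for flag readers.
 */
static void zram_entry_lock_rt(struct zram_table_entry *e)
{
	spin_lock(&e->lock);
	__set_bit(ZRAM_LOCK, &e->flags);
}
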
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index ed5dabd3c72d6..450d7985ff346 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -50,6 +50,45 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
return container_of(data, struct tpm_tis_tcg_phy, priv);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flush previous write operations with a dummy read operation to the
+ * TPM MMIO base address.
+ */
+static inline void tpm_tis_flush(void __iomem *iobase)
+{
+ ioread8(iobase + TPM_ACCESS(0));
+}
+#else
+#define tpm_tis_flush(iobase) do { } while (0)
+#endif
+
+/*
+ * Write a byte to the TPM MMIO address, and flush the write queue.
+ * The flush ensures that the data is sent immediately over the bus and not
+ * aggregated with further requests and transferred later in a batch. The large
+ * write requests can lead to unwanted latency spikes by blocking the CPU until
+ * the complete batch has been transferred.
+ */
+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
+{
+ iowrite8(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
+/*
+ * Write a 32-bit word to the TPM MMIO address, and flush the write queue.
+ * The flush ensures that the data is sent immediately over the bus and not
+ * aggregated with further requests and transferred later in a batch. The large
+ * write requests can lead to unwanted latency spikes by blocking the CPU until
+ * the complete batch has been transferred.
+ */
+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
+{
+ iowrite32(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
static int interrupts = -1;
module_param(interrupts, int, 0444);
MODULE_PARM_DESC(interrupts, "Enable interrupts");
@@ -186,12 +225,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
switch (io_mode) {
case TPM_TIS_PHYS_8:
while (len--)
- iowrite8(*value++, phy->iobase + addr);
+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
break;
case TPM_TIS_PHYS_16:
return -EINVAL;
case TPM_TIS_PHYS_32:
- iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase + addr);
+ tpm_tis_iowrite32(le32_to_cpu(*((__le32 *)value)), phy->iobase, addr);
break;
}
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 98f4e44976e09..d4dba1f89fde5 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -3,7 +3,6 @@ config DRM_I915
tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics"
depends on DRM
depends on X86 && PCI
- depends on !PREEMPT_RT
select INTEL_GTT if X86
select INTERVAL_TREE
# we need shmfs for the swappable backing store, and in particular
diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c
index d5b5d40ed817f..710e51d2377fe 100644
--- a/drivers/gpu/drm/i915/display/intel_crtc.c
+++ b/drivers/gpu/drm/i915/display/intel_crtc.c
@@ -520,7 +520,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
*/
intel_psr_wait_for_idle_locked(new_crtc_state);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
crtc->debug.min_vbl = min;
crtc->debug.max_vbl = max;
@@ -545,11 +546,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
break;
}
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
timeout = schedule_timeout(timeout);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
finish_wait(wq, &wait);
@@ -582,7 +585,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state)
return;
irq_disable:
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE)
@@ -691,7 +695,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state)
if (new_crtc_state->seamless_m_n && intel_crtc_needs_fastset(new_crtc_state))
intel_crtc_update_active_timings(new_crtc_state);
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
if (intel_vgpu_active(dev_priv))
return;
diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c
index 4c83e2320bcac..2dd4ac8b30266 100644
--- a/drivers/gpu/drm/i915/display/intel_vblank.c
+++ b/drivers/gpu/drm/i915/display/intel_vblank.c
@@ -293,7 +293,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
*/
spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
/* Get optional system timestamp before query. */
if (stime)
@@ -358,7 +359,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
if (etime)
*etime = ktime_get();
- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index ecc990ec1b952..8d04b10681f0d 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
/* Kick the work once more to drain the signalers, and disarm the irq */
irq_work_sync(&b->irq_work);
while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
- local_irq_disable();
- signal_irq_work(&b->irq_work);
- local_irq_enable();
+ irq_work_queue(&b->irq_work);
cond_resched();
+ irq_work_sync(&b->irq_work);
}
}
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 750326434677f..a2658a8ff7353 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* and context switches) submission.
*/
- spin_lock(&sched_engine->lock);
+ spin_lock_irq(&sched_engine->lock);
/*
* If the queue is higher priority than the last
@@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* Even if ELSP[1] is occupied and not worthy
* of timeslices, our queue might be.
*/
- spin_unlock(&sched_engine->lock);
+ spin_unlock_irq(&sched_engine->lock);
return;
}
}
@@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
if (last && !can_merge_rq(last, rq)) {
spin_unlock(&ve->base.sched_engine->lock);
- spin_unlock(&engine->sched_engine->lock);
+ spin_unlock_irq(&engine->sched_engine->lock);
return; /* leave this for another sibling */
}
@@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
*/
sched_engine->queue_priority_hint = queue_prio(sched_engine);
i915_sched_engine_reset_on_empty(sched_engine);
- spin_unlock(&sched_engine->lock);
+ spin_unlock_irq(&sched_engine->lock);
/*
* We can skip poking the HW if we ended up with exactly the same set
@@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
}
}
-static void execlists_dequeue_irq(struct intel_engine_cs *engine)
-{
- local_irq_disable(); /* Suspend interrupts across request submission */
- execlists_dequeue(engine);
- local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */
-}
-
static void clear_ports(struct i915_request **ports, int count)
{
memset_p((void **)ports, NULL, count);
@@ -2477,7 +2470,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t)
}
if (!engine->execlists.pending[0]) {
- execlists_dequeue_irq(engine);
+ execlists_dequeue(engine);
start_timeslice(engine);
}
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 7503dcb9043bb..799fb8083470f 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -613,7 +613,6 @@ bool __i915_request_submit(struct i915_request *request)
RQ_TRACE(request, "\n");
- GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->sched_engine->lock);
/*
@@ -722,7 +721,6 @@ void __i915_request_unsubmit(struct i915_request *request)
*/
RQ_TRACE(request, "\n");
- GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->sched_engine->lock);
/*
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index f6f9228a13518..0ff1b60be8382 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -6,6 +6,10 @@
#if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
#define _I915_TRACE_H_
+#ifdef CONFIG_PREEMPT_RT
+#define NOTRACE
+#endif
+
#include <linux/stringify.h>
#include <linux/types.h>
#include <linux/tracepoint.h>
@@ -322,7 +326,7 @@ DEFINE_EVENT(i915_request, i915_request_add,
TP_ARGS(rq)
);
-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS)
+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE)
DEFINE_EVENT(i915_request, i915_request_guc_submit,
TP_PROTO(struct i915_request *rq),
TP_ARGS(rq)
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 2c430c0c3badd..7ec828637d622 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -288,7 +288,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
#define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000)
/* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */
-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT)
+#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT)
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic())
#else
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0)
diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 1e8fe44a7099f..b336d00d7988e 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -177,12 +177,277 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value)
up->dl_write(up, value);
}
+static inline bool serial8250_is_console(struct uart_port *port)
+{
+ return uart_console(port) && !hlist_unhashed_lockless(&port->cons->node);
+}
+
+/**
+ * serial8250_init_wctxt - Initialize a write context for
+ * non-console-printing usage
+ * @wctxt: The write context to initialize
+ * @cons: The console to assign to the write context
+ *
+ * In order to mark an unsafe region, drivers must acquire the console. This
+ * requires providing an initialized write context (even if that driver will
+ * not be doing any printing).
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_init_wctxt(struct cons_write_context *wctxt,
+ struct console *cons)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ memset(wctxt, 0, sizeof(*wctxt));
+ ctxt->console = cons;
+ ctxt->prio = CONS_PRIO_NORMAL;
+}
+
+/**
+ * __serial8250_console_acquire - Acquire a console for
+ * non-console-printing usage
+ * @wctxt: An uninitialized write context to use for acquiring
+ * @cons: The console to assign to the write context
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void __serial8250_console_acquire(struct cons_write_context *wctxt,
+ struct console *cons)
+{
+ for (;;) {
+ serial8250_init_wctxt(wctxt, cons);
+ if (console_try_acquire(wctxt))
+ break;
+ cpu_relax();
+ }
+}
+
+/**
+ * serial8250_enter_unsafe - Mark the beginning of an unsafe region for
+ * non-console-printing usage
+ * @up: The port that is entering the unsafe state
+ *
+ * The caller should ensure @up is a console before calling this function.
+ *
+ * The caller is holding the port->lock.
+ * This function takes the console_srcu_read_lock and becomes owner of the
+ * console associated with @up.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_enter_unsafe(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+
+ lockdep_assert_held_once(&port->lock);
+
+ for (;;) {
+ up->cookie = console_srcu_read_lock();
+
+ __serial8250_console_acquire(&up->wctxt, port->cons);
+
+ if (console_enter_unsafe(&up->wctxt))
+ break;
+
+ console_srcu_read_unlock(up->cookie);
+ cpu_relax();
+ }
+}
+
+/**
+ * serial8250_exit_unsafe - Mark the end of an unsafe region for
+ * non-console-printing usage
+ * @up: The port that is exiting the unsafe state
+ *
+ * The caller is holding the port->lock.
+ * This function releases ownership of the console associated with @up and
+ * releases the console_srcu_read_lock.
+ *
+ * This function should not be used for console printing contexts.
+ */
+static inline void serial8250_exit_unsafe(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+
+ lockdep_assert_held_once(&port->lock);
+
+ if (console_exit_unsafe(&up->wctxt))
+ console_release(&up->wctxt);
+
+ console_srcu_read_unlock(up->cookie);
+}
+
+/**
+ * serial8250_in_IER - Read the IER register for
+ * non-console-printing usage
+ * @up: The port to work on
+ *
+ * Returns: The value read from IER
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * read the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline int serial8250_in_IER(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ bool is_console;
+ int ier;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
+ ier = serial_in(up, UART_IER);
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
+
+ return ier;
+}
+
+/**
+ * __serial8250_set_IER - Directly write to the IER register
+ * @up: The port to work on
+ * @wctxt: The current write context
+ * @ier: The value to write
+ *
+ * Returns: True if IER was written to. False otherwise
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ * The caller is the owner of the console associated with @up.
+ *
+ * This function should only be directly called within console printing
+ * contexts. Other contexts should use serial8250_set_IER().
+ */
+static inline bool __serial8250_set_IER(struct uart_8250_port *up,
+ struct cons_write_context *wctxt,
+ int ier)
+{
+ if (wctxt && !console_can_proceed(wctxt))
+ return false;
+ serial_out(up, UART_IER, ier);
+ return true;
+}
+
+/**
+ * serial8250_set_IER - Write a new value to the IER register for
+ * non-console-printing usage
+ * @up: The port to work on
+ * @ier: The value to write
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * write to the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline void serial8250_set_IER(struct uart_8250_port *up, int ier)
+{
+ struct uart_port *port = &up->port;
+ bool is_console;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console) {
+ serial8250_enter_unsafe(up);
+ while (!__serial8250_set_IER(up, &up->wctxt, ier)) {
+ console_srcu_read_unlock(up->cookie);
+ console_enter_unsafe(&up->wctxt);
+ }
+ serial8250_exit_unsafe(up);
+ } else {
+ __serial8250_set_IER(up, NULL, ier);
+ }
+}
+
+/**
+ * __serial8250_clear_IER - Directly clear the IER register
+ * @up: The port to work on
+ * @wctxt: The current write context
+ * @prior: Gets set to the previous value of IER
+ *
+ * Returns: True if IER was cleared and @prior points to the previous
+ * value of IER. False otherwise and @prior is invalid
+ *
+ * The caller is holding the port->lock.
+ * The caller is holding the console_srcu_read_lock.
+ * The caller is the owner of the console associated with @up.
+ *
+ * This function should only be directly called within console printing
+ * contexts. Other contexts should use serial8250_clear_IER().
+ */
+static inline bool __serial8250_clear_IER(struct uart_8250_port *up,
+ struct cons_write_context *wctxt,
+ int *prior)
+{
+ unsigned int clearval = 0;
+
+ if (up->capabilities & UART_CAP_UUE)
+ clearval = UART_IER_UUE;
+
+ *prior = serial_in(up, UART_IER);
+ if (wctxt && !console_can_proceed(wctxt))
+ return false;
+ serial_out(up, UART_IER, clearval);
+ return true;
+}
+
+/**
+ * serial8250_clear_IER - Clear the IER register for
+ * non-console-printing usage
+ * @up: The port to work on
+ *
+ * Returns: The previous value of IER
+ *
+ * The caller is holding the port->lock.
+ *
+ * This is the top-level function for non-console-printing contexts to
+ * clear the IER register. The caller does not need to care if @up is a
+ * console before calling this function.
+ *
+ * This function should not be used for printing contexts.
+ */
+static inline int serial8250_clear_IER(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ bool is_console;
+ int prior;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console) {
+ serial8250_enter_unsafe(up);
+ while (!__serial8250_clear_IER(up, &up->wctxt, &prior)) {
+ console_srcu_read_unlock(up->cookie);
+ console_enter_unsafe(&up->wctxt);
+ }
+ serial8250_exit_unsafe(up);
+ } else {
+ __serial8250_clear_IER(up, NULL, &prior);
+ }
+
+ return prior;
+}
+
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
if (up->ier & UART_IER_THRI)
return false;
up->ier |= UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
@@ -191,7 +456,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
if (!(up->ier & UART_IER_THRI))
return false;
up->ier &= ~UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
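
Driver paths that need several raw register accesses under one ownership window can bracket the whole sequence with the enter/exit helpers, exactly as the bcm7271 and mtk hunks below do. A condensed sketch assuming the helpers above (the function itself is hypothetical):

/* Sketch: a read-modify-write of IER as one unsafe region when the
 * port doubles as a console; mirrors mtk8250_disable_intrs() below.
 */
static void example_mask_intrs(struct uart_8250_port *up, int mask)
{
	bool is_console = serial8250_is_console(&up->port);

	if (is_console)
		serial8250_enter_unsafe(up);

	serial_out(up, UART_IER, serial_in(up, UART_IER) & ~mask);

	if (is_console)
		serial8250_exit_unsafe(up);
}
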
diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 9d2a7856784f7..7cc6b527c088b 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up,
up->ier &= ~irqs;
if (!throttle)
up->ier |= irqs;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle)
{
diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c
index f801b1f5b46c0..a29f5f45d22f2 100644
--- a/drivers/tty/serial/8250/8250_bcm7271.c
+++ b/drivers/tty/serial/8250/8250_bcm7271.c
@@ -606,8 +606,10 @@ static int brcmuart_startup(struct uart_port *port)
* Disable the Receive Data Interrupt because the DMA engine
* will handle this.
*/
+ spin_lock_irq(&port->lock);
up->ier &= ~UART_IER_RDI;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
+ spin_unlock_irq(&port->lock);
priv->tx_running = false;
priv->dma.rx_dma = NULL;
@@ -787,6 +789,12 @@ static int brcmuart_handle_irq(struct uart_port *p)
spin_lock_irqsave(&p->lock, flags);
status = serial_port_in(p, UART_LSR);
if ((status & UART_LSR_DR) == 0) {
+ bool is_console;
+
+ is_console = serial8250_is_console(p);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
ier = serial_port_in(p, UART_IER);
/*
@@ -807,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p)
serial_port_in(p, UART_RX);
}
+ if (is_console)
+ serial8250_exit_unsafe(up);
+
handled = 1;
}
spin_unlock_irqrestore(&p->lock, flags);
@@ -844,12 +855,22 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t)
/* re-enable receive unless upper layer has disabled it */
if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) ==
(UART_IER_RLSI | UART_IER_RDI)) {
+ bool is_console;
+
+ is_console = serial8250_is_console(p);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
status = serial_port_in(p, UART_IER);
status |= (UART_IER_RLSI | UART_IER_RDI);
serial_port_out(p, UART_IER, status);
status = serial_port_in(p, UART_MCR);
status |= UART_MCR_RTS;
serial_port_out(p, UART_MCR, status);
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
}
spin_unlock_irqrestore(&p->lock, flags);
return HRTIMER_NORESTART;
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index ab63c308be0a2..8e89bffa1f121 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -256,6 +256,7 @@ static void serial8250_timeout(struct timer_list *t)
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
+ struct uart_port *port = &up->port;
unsigned int iir, ier = 0, lsr;
unsigned long flags;
@@ -266,8 +267,23 @@ static void serial8250_backup_timeout(struct timer_list *t)
* based handler.
*/
if (up->port.irq) {
+ bool is_console;
+
+ /*
+ * Do not use serial8250_clear_IER() because this code
+ * ignores capabilities.
+ */
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
ier = serial_in(up, UART_IER);
serial_out(up, UART_IER, 0);
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
}
iir = serial_in(up, UART_IIR);
@@ -290,7 +306,7 @@ static void serial8250_backup_timeout(struct timer_list *t)
serial8250_tx_chars(up);
if (up->port.irq)
- serial_out(up, UART_IER, ier);
+ serial8250_set_IER(up, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
@@ -576,12 +592,30 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
#ifdef CONFIG_SERIAL_8250_CONSOLE
-static void univ8250_console_write(struct console *co, const char *s,
- unsigned int count)
+static void univ8250_console_port_lock(struct console *con, bool do_lock, unsigned long *flags)
+{
+ struct uart_8250_port *up = &serial8250_ports[con->index];
+
+ if (do_lock)
+ spin_lock_irqsave(&up->port.lock, *flags);
+ else
+ spin_unlock_irqrestore(&up->port.lock, *flags);
+}
+
+static bool univ8250_console_write_atomic(struct console *co,
+ struct cons_write_context *wctxt)
{
struct uart_8250_port *up = &serial8250_ports[co->index];
- serial8250_console_write(up, s, count);
+ return serial8250_console_write_atomic(up, wctxt);
+}
+
+static bool univ8250_console_write_thread(struct console *co,
+ struct cons_write_context *wctxt)
+{
+ struct uart_8250_port *up = &serial8250_ports[co->index];
+
+ return serial8250_console_write_thread(up, wctxt);
}
static int univ8250_console_setup(struct console *co, char *options)
@@ -669,12 +703,14 @@ static int univ8250_console_match(struct console *co, char *name, int idx,
static struct console univ8250_console = {
.name = "ttyS",
- .write = univ8250_console_write,
+ .write_atomic = univ8250_console_write_atomic,
+ .write_thread = univ8250_console_write_thread,
+ .port_lock = univ8250_console_port_lock,
.device = uart_console_device,
.setup = univ8250_console_setup,
.exit = univ8250_console_exit,
.match = univ8250_console_match,
- .flags = CON_PRINTBUFFER | CON_ANYTIME,
+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NO_BKL,
.index = -1,
.data = &serial8250_reg,
};
@@ -962,7 +998,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work)
spin_lock_irqsave(&port->lock, flags);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
up->port.read_status_mask |= UART_LSR_DR;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
spin_unlock_irqrestore(&port->lock, flags);
}
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 64770c62bbec5..ccb70b20b1f4f 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -185,6 +185,10 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud,
static int xr17v35x_startup(struct uart_port *port)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
+
+ spin_lock_irq(&port->lock);
+
/*
* First enable access to IER [7:5], ISR [5:4], FCR [5:4],
* MCR [7:5] and MSR [7:0]
@@ -195,7 +199,9 @@ static int xr17v35x_startup(struct uart_port *port)
* Make sure all interrupts are masked until initialization is
* complete and the FIFOs are cleared
*/
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
+
+ spin_unlock_irq(&port->lock);
return serial8250_do_startup(port);
}
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index 8adfaa183f778..eaf148245a10d 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port)
if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
unsigned long delay;
- up->ier = port->serial_in(port, UART_IER);
+ up->ier = serial8250_in_IER(up);
+
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c
index fb1d5ec0940e6..bf7ab55c8923f 100644
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -222,12 +222,38 @@ static void mtk8250_shutdown(struct uart_port *port)
static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+ struct uart_port *port = &up->port;
+ bool is_console;
+ int ier;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier & (~mask));
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
}
static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+ struct uart_port *port = &up->port;
+ bool is_console;
+ int ier;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier | mask);
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
}
static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 734f092ef839a..bfa50a26349dd 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -334,8 +334,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up)
/* drop TCR + TLR access, we setup XON/XOFF later */
serial8250_out_MCR(up, mcr);
-
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
serial_dl_write(up, priv->quot);
@@ -523,16 +522,21 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state,
u8 efr;
pm_runtime_get_sync(port->dev);
+
+ spin_lock_irq(&port->lock);
+
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
efr = serial_in(up, UART_EFR);
serial_out(up, UART_EFR, efr | UART_EFR_ECB);
serial_out(up, UART_LCR, 0);
- serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(up, (state != 0) ? UART_IERX_SLEEP : 0);
serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(up, UART_EFR, efr);
serial_out(up, UART_LCR, 0);
+ spin_unlock_irq(&port->lock);
+
pm_runtime_mark_last_busy(port->dev);
pm_runtime_put_autosuspend(port->dev);
}
@@ -649,7 +653,8 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) {
unsigned long delay;
- up->ier = port->serial_in(port, UART_IER);
+ spin_lock(&port->lock);
+ up->ier = serial8250_in_IER(up);
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
@@ -658,6 +663,7 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
*/
cancel_delayed_work(&up->overrun_backoff);
}
+ spin_unlock(&port->lock);
delay = msecs_to_jiffies(up->overrun_backoff_time_ms);
schedule_delayed_work(&up->overrun_backoff, delay);
@@ -707,8 +713,10 @@ static int omap_8250_startup(struct uart_port *port)
if (ret < 0)
goto err;
+ spin_lock_irq(&port->lock);
up->ier = UART_IER_RLSI | UART_IER_RDI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
+ spin_unlock_irq(&port->lock);
#ifdef CONFIG_PM
up->capabilities |= UART_CAP_RPM;
@@ -748,8 +756,10 @@ static void omap_8250_shutdown(struct uart_port *port)
if (priv->habit & UART_HAS_EFR2)
serial_out(up, UART_OMAP_EFR2, 0x0);
+ spin_lock_irq(&port->lock);
up->ier = 0;
- serial_out(up, UART_IER, 0);
+ serial8250_set_IER(up, 0);
+ spin_unlock_irq(&port->lock);
if (up->dma)
serial8250_release_dma(up);
@@ -797,7 +807,7 @@ static void omap_8250_unthrottle(struct uart_port *port)
up->dma->rx_dma(up);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
port->read_status_mask |= UART_LSR_DR;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
spin_unlock_irqrestore(&port->lock, flags);
pm_runtime_mark_last_busy(port->dev);
@@ -956,7 +966,7 @@ static void __dma_rx_complete(void *param)
__dma_rx_do_complete(p);
if (!priv->throttled) {
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_out(p, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
if (!(priv->habit & UART_HAS_EFR2))
omap_8250_rx_dma(p);
}
@@ -1013,7 +1023,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p)
* callback to run.
*/
p->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
- serial_out(p, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
}
goto out;
}
@@ -1226,12 +1236,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir,
* periodic timeouts, re-enable interrupts.
*/
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
omap_8250_rx_dma_flush(up);
serial_in(up, UART_IIR);
serial_out(up, UART_OMAP_EFR2, 0x0);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
}
@@ -1717,12 +1727,16 @@ static int omap8250_runtime_resume(struct device *dev)
up = serial8250_get_port(priv->line);
+ spin_lock_irq(&up->port.lock);
+
if (omap8250_lost_context(up))
omap8250_restore_regs(up);
if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2))
omap_8250_rx_dma(up);
+ spin_unlock_irq(&up->port.lock);
+
priv->latency = priv->calc_latency;
schedule_work(&priv->qos_work);
return 0;
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index fe8d79c4ae95e..68f01f8bdf64b 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -745,6 +745,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
serial8250_rpm_get(p);
if (p->capabilities & UART_CAP_SLEEP) {
+ spin_lock_irq(&p->port.lock);
if (p->capabilities & UART_CAP_EFR) {
lcr = serial_in(p, UART_LCR);
efr = serial_in(p, UART_EFR);
@@ -752,25 +753,18 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
serial_out(p, UART_LCR, lcr);
}
+ spin_unlock_irq(&p->port.lock);
}
serial8250_rpm_put(p);
}
-static void serial8250_clear_IER(struct uart_8250_port *up)
-{
- if (up->capabilities & UART_CAP_UUE)
- serial_out(up, UART_IER, UART_IER_UUE);
- else
- serial_out(up, UART_IER, 0);
-}
-
#ifdef CONFIG_SERIAL_8250_RSA
/*
* Attempts to turn on the RSA FIFO. Returns zero on failure.
@@ -1034,8 +1028,10 @@ static int broken_efr(struct uart_8250_port *up)
*/
static void autoconfig_16550a(struct uart_8250_port *up)
{
+ struct uart_port *port = &up->port;
unsigned char status1, status2;
unsigned int iersave;
+ bool is_console;
up->port.type = PORT_16550A;
up->capabilities |= UART_CAP_FIFO;
@@ -1151,6 +1147,11 @@ static void autoconfig_16550a(struct uart_8250_port *up)
return;
}
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
/*
* Try writing and reading the UART_IER_UUE bit (b6).
* If it works, this is probably one of the Xscale platform's
@@ -1186,6 +1187,9 @@ static void autoconfig_16550a(struct uart_8250_port *up)
}
serial_out(up, UART_IER, iersave);
+ if (is_console)
+ serial8250_exit_unsafe(up);
+
/*
* We distinguish between 16550A and U6 16550A by counting
* how many bytes are in the FIFO.
@@ -1227,6 +1231,13 @@ static void autoconfig(struct uart_8250_port *up)
up->bugs = 0;
if (!(port->flags & UPF_BUGGY_UART)) {
+ bool is_console;
+
+ is_console = serial8250_is_console(port);
+
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
/*
* Do a simple existence test first; if we fail this,
* there's no point trying anything else.
@@ -1256,6 +1267,10 @@ static void autoconfig(struct uart_8250_port *up)
#endif
scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
serial_out(up, UART_IER, scratch);
+
+ if (is_console)
+ serial8250_exit_unsafe(up);
+
if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) {
/*
* We failed; there's nothing here
@@ -1377,6 +1392,7 @@ static void autoconfig_irq(struct uart_8250_port *up)
unsigned char save_ICP = 0;
unsigned int ICP = 0;
unsigned long irqs;
+ bool is_console;
int irq;
if (port->flags & UPF_FOURPORT) {
@@ -1386,8 +1402,12 @@ static void autoconfig_irq(struct uart_8250_port *up)
inb_p(ICP);
}
- if (uart_console(port))
+ is_console = serial8250_is_console(port);
+
+ if (is_console) {
console_lock();
+ serial8250_enter_unsafe(up);
+ }
/* forget possible initially masked and pending IRQ */
probe_irq_off(probe_irq_on());
@@ -1419,8 +1439,10 @@ static void autoconfig_irq(struct uart_8250_port *up)
if (port->flags & UPF_FOURPORT)
outb_p(save_ICP, ICP);
- if (uart_console(port))
+ if (is_console) {
+ serial8250_exit_unsafe(up);
console_unlock();
+ }
port->irq = (irq > 0) ? irq : 0;
}
@@ -1433,7 +1455,7 @@ static void serial8250_stop_rx(struct uart_port *port)
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1463,7 +1485,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p)
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_port_out(&p->port, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
}
}
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);
@@ -1710,7 +1732,7 @@ static void serial8250_disable_ms(struct uart_port *port)
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1726,7 +1748,7 @@ static void serial8250_enable_ms(struct uart_port *port)
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -2176,9 +2198,10 @@ static void serial8250_put_poll_char(struct uart_port *port,
serial8250_rpm_get(up);
/*
* First save the IER then disable the interrupts
+ *
+ * Best-effort IER access because other CPUs are quiesced.
*/
- ier = serial_port_in(port, UART_IER);
- serial8250_clear_IER(up);
+ __serial8250_clear_IER(up, NULL, &ier);
wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
/*
@@ -2191,7 +2214,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
* and restore the IER
*/
wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ __serial8250_set_IER(up, NULL, ier);
serial8250_rpm_put(up);
}
@@ -2202,6 +2225,7 @@ int serial8250_do_startup(struct uart_port *port)
struct uart_8250_port *up = up_to_u8250p(port);
unsigned long flags;
unsigned char iir;
+ bool is_console;
int retval;
u16 lsr;
@@ -2219,21 +2243,25 @@ int serial8250_do_startup(struct uart_port *port)
serial8250_rpm_get(up);
if (port->type == PORT_16C950) {
/* Wake up and initialize UART */
+ spin_lock_irqsave(&port->lock, flags);
up->acr = 0;
serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
serial_port_out(port, UART_EFR, UART_EFR_ECB);
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
serial_port_out(port, UART_LCR, 0);
serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
serial_port_out(port, UART_EFR, UART_EFR_ECB);
serial_port_out(port, UART_LCR, 0);
+ spin_unlock_irqrestore(&port->lock, flags);
}
if (port->type == PORT_DA830) {
/* Reset the port */
- serial_port_out(port, UART_IER, 0);
+ spin_lock_irqsave(&port->lock, flags);
+ serial8250_set_IER(up, 0);
serial_port_out(port, UART_DA830_PWREMU_MGMT, 0);
+ spin_unlock_irqrestore(&port->lock, flags);
mdelay(10);
/* Enable Tx, Rx and free run mode */
@@ -2331,6 +2359,8 @@ int serial8250_do_startup(struct uart_port *port)
if (retval)
goto out;
+ is_console = serial8250_is_console(port);
+
if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) {
unsigned char iir1;
@@ -2347,6 +2377,9 @@ int serial8250_do_startup(struct uart_port *port)
*/
spin_lock_irqsave(&port->lock, flags);
+ if (is_console)
+ serial8250_enter_unsafe(up);
+
wait_for_xmitr(up, UART_LSR_THRE);
serial_port_out_sync(port, UART_IER, UART_IER_THRI);
udelay(1); /* allow THRE to set */
@@ -2357,6 +2390,9 @@ int serial8250_do_startup(struct uart_port *port)
iir = serial_port_in(port, UART_IIR);
serial_port_out(port, UART_IER, 0);
+ if (is_console)
+ serial8250_exit_unsafe(up);
+
spin_unlock_irqrestore(&port->lock, flags);
if (port->irqflags & IRQF_SHARED)
@@ -2411,10 +2447,14 @@ int serial8250_do_startup(struct uart_port *port)
* Do a quick test to see if we receive an interrupt when we enable
* the TX irq.
*/
+ if (is_console)
+ serial8250_enter_unsafe(up);
serial_port_out(port, UART_IER, UART_IER_THRI);
lsr = serial_port_in(port, UART_LSR);
iir = serial_port_in(port, UART_IIR);
serial_port_out(port, UART_IER, 0);
+ if (is_console)
+ serial8250_exit_unsafe(up);
if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
if (!(up->bugs & UART_BUG_TXEN)) {
@@ -2446,7 +2486,7 @@ int serial8250_do_startup(struct uart_port *port)
if (up->dma) {
const char *msg = NULL;
- if (uart_console(port))
+ if (is_console)
msg = "forbid DMA for kernel console";
else if (serial8250_request_dma(up))
msg = "failed to request DMA";
@@ -2497,7 +2537,7 @@ void serial8250_do_shutdown(struct uart_port *port)
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2863,7 +2903,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3328,12 +3368,21 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults);
#ifdef CONFIG_SERIAL_8250_CONSOLE
-static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
+static bool serial8250_console_putchar(struct uart_port *port, unsigned char ch,
+ struct cons_write_context *wctxt)
{
struct uart_8250_port *up = up_to_u8250p(port);
wait_for_xmitr(up, UART_LSR_THRE);
+ if (!console_can_proceed(wctxt))
+ return false;
serial_port_out(port, UART_TX, ch);
+ if (ch == '\n')
+ up->console_newline_needed = false;
+ else
+ up->console_newline_needed = true;
+
+ return true;
}
/*
@@ -3362,33 +3411,119 @@ static void serial8250_console_restore(struct uart_8250_port *up)
serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS);
}
-/*
- * Print a string to the serial port using the device FIFO
- *
- * It sends fifosize bytes and then waits for the fifo
- * to get empty.
- */
-static void serial8250_console_fifo_write(struct uart_8250_port *up,
- const char *s, unsigned int count)
+static bool __serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt,
+ const char *s, unsigned int count,
+ bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *))
{
- int i;
- const char *end = s + count;
- unsigned int fifosize = up->tx_loadsz;
- bool cr_sent = false;
+ bool finished = false;
+ unsigned int i;
- while (s != end) {
- wait_for_lsr(up, UART_LSR_THRE);
-
- for (i = 0; i < fifosize && s != end; ++i) {
- if (*s == '\n' && !cr_sent) {
- serial_out(up, UART_TX, '\r');
- cr_sent = true;
- } else {
- serial_out(up, UART_TX, *s++);
- cr_sent = false;
- }
+ for (i = 0; i < count; i++, s++) {
+ if (*s == '\n') {
+ if (!putchar(port, '\r', wctxt))
+ goto out;
}
+ if (!putchar(port, *s, wctxt))
+ goto out;
}
+ finished = true;
+out:
+ return finished;
+}
+
+static bool serial8250_console_write(struct uart_port *port, struct cons_write_context *wctxt,
+ const char *s, unsigned int count,
+ bool (*putchar)(struct uart_port *, unsigned char, struct cons_write_context *))
+{
+ return __serial8250_console_write(port, wctxt, s, count, putchar);
+}
+
+static bool atomic_print_line(struct uart_8250_port *up,
+ struct cons_write_context *wctxt)
+{
+ struct uart_port *port = &up->port;
+
+ if (up->console_newline_needed &&
+ !__serial8250_console_write(port, wctxt, "\n", 1, serial8250_console_putchar)) {
+ return false;
+ }
+
+ return __serial8250_console_write(port, wctxt, wctxt->outbuf, wctxt->len,
+ serial8250_console_putchar);
+}
+
+static void atomic_console_reacquire(struct cons_write_context *wctxt,
+ struct cons_write_context *wctxt_init)
+{
+ memcpy(wctxt, wctxt_init, sizeof(*wctxt));
+ while (!console_try_acquire(wctxt)) {
+ cpu_relax();
+ memcpy(wctxt, wctxt_init, sizeof(*wctxt));
+ }
+}
+
+bool serial8250_console_write_atomic(struct uart_8250_port *up,
+ struct cons_write_context *wctxt)
+{
+ struct cons_write_context wctxt_init = { };
+ struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt);
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool finished = false;
+ unsigned int ier;
+
+ touch_nmi_watchdog();
+
+ /* With write_atomic, another context may hold the port->lock. */
+
+ ctxt_init->console = ctxt->console;
+ ctxt_init->prio = ctxt->prio;
+ ctxt_init->thread = ctxt->thread;
+
+ /*
+ * Enter unsafe in order to disable interrupts. If the console is
+ * lost before the interrupts are disabled, bail out because another
+ * context took over the printing. If the console is lost after the
+ * interrupts are disabled, the console must be reacquired in order
+ * to re-enable the interrupts. However, in that case no printing is
+ * allowed because another context took over the printing.
+ */
+
+ if (!console_enter_unsafe(wctxt))
+ return false;
+
+ if (!__serial8250_clear_IER(up, wctxt, &ier))
+ return false;
+
+ if (!console_exit_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
+
+ if (!atomic_print_line(up, wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
+
+ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
+ finished = true;
+enable_irq:
+ /*
+ * Enter unsafe in order to enable interrupts. If the console is
+ * lost before the interrupts are enabled, the console must be
+ * reacquired in order to re-enable the interrupts.
+ */
+ for (;;) {
+ if (console_enter_unsafe(wctxt) &&
+ __serial8250_set_IER(up, wctxt, ier)) {
+ break;
+ }
+
+ /* HW-IRQs still disabled. Reacquire to enable them. */
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ }
+ console_exit_unsafe(wctxt);
+
+ return finished;
}
/*
@@ -3400,78 +3535,116 @@ static void serial8250_console_fifo_write(struct uart_8250_port *up,
* Doing runtime PM is really a bad idea for the kernel console.
* Thus, we assume the function is called when device is powered up.
*/
-void serial8250_console_write(struct uart_8250_port *up, const char *s,
- unsigned int count)
+bool serial8250_console_write_thread(struct uart_8250_port *up,
+ struct cons_write_context *wctxt)
{
+ struct cons_write_context wctxt_init = { };
+ struct cons_context *ctxt_init = &ACCESS_PRIVATE(&wctxt_init, ctxt);
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
struct uart_8250_em485 *em485 = up->em485;
struct uart_port *port = &up->port;
- unsigned long flags;
- unsigned int ier, use_fifo;
- int locked = 1;
+ unsigned int count = wctxt->len;
+ const char *s = wctxt->outbuf;
+ bool rs485_started = false;
+ bool finished = false;
+ unsigned int ier;
- touch_nmi_watchdog();
-
- if (oops_in_progress)
- locked = spin_trylock_irqsave(&port->lock, flags);
- else
- spin_lock_irqsave(&port->lock, flags);
+ ctxt_init->console = ctxt->console;
+ ctxt_init->prio = ctxt->prio;
+ ctxt_init->thread = ctxt->thread;
/*
- * First save the IER then disable the interrupts
+ * Enter unsafe in order to disable interrupts. If the console is
+ * lost before the interrupts are disabled, bail out because another
+ * context took over the printing. If the console is lost after the
+ * interrupts are disabled, the console must be reacquired in order
+ * to re-enable the interrupts. However, in that case no printing is
+ * allowed because another context took over the printing.
*/
- ier = serial_port_in(port, UART_IER);
- serial8250_clear_IER(up);
+
+ if (!console_enter_unsafe(wctxt))
+ return false;
+
+ if (!__serial8250_clear_IER(up, wctxt, &ier))
+ return false;
+
+ if (!console_exit_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
+ if (!console_enter_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
serial8250_console_restore(up);
+ if (!console_exit_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
up->canary = 0;
}
if (em485) {
- if (em485->tx_stopped)
+ if (em485->tx_stopped) {
+ if (!console_enter_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
up->rs485_start_tx(up);
- mdelay(port->rs485.delay_rts_before_send);
+ rs485_started = true;
+ if (!console_exit_unsafe(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
+ }
+ if (port->rs485.delay_rts_before_send) {
+ mdelay(port->rs485.delay_rts_before_send);
+ if (!console_can_proceed(wctxt)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
+ }
}
- use_fifo = (up->capabilities & UART_CAP_FIFO) &&
- /*
- * BCM283x requires to check the fifo
- * after each byte.
- */
- !(up->capabilities & UART_CAP_MINI) &&
- /*
- * tx_loadsz contains the transmit fifo size
- */
- up->tx_loadsz > 1 &&
- (up->fcr & UART_FCR_ENABLE_FIFO) &&
- port->state &&
- test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) &&
- /*
- * After we put a data in the fifo, the controller will send
- * it regardless of the CTS state. Therefore, only use fifo
- * if we don't use control flow.
- */
- !(up->port.flags & UPF_CONS_FLOW);
+ if (!serial8250_console_write(port, wctxt, s, count, serial8250_console_putchar)) {
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ goto enable_irq;
+ }
- if (likely(use_fifo))
- serial8250_console_fifo_write(up, s, count);
- else
- uart_console_write(port, s, count, serial8250_console_putchar);
-
- /*
- * Finally, wait for transmitter to become empty
- * and restore the IER
- */
wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
-
+ finished = true;
+enable_irq:
+ /*
+ * Enter unsafe in order to stop rs485_tx. If the console is
+ * lost before the rs485_tx is stopped, the console must be
+ * reacquired in order to stop rs485_tx.
+ */
if (em485) {
mdelay(port->rs485.delay_rts_after_send);
- if (em485->tx_stopped)
+ if (em485->tx_stopped && rs485_started) {
+ while (!console_enter_unsafe(wctxt))
+ atomic_console_reacquire(wctxt, &wctxt_init);
up->rs485_stop_tx(up);
+ if (!console_exit_unsafe(wctxt))
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ }
}
- serial_port_out(port, UART_IER, ier);
+ /*
+ * Enter unsafe in order to enable interrupts. If the console is
+ * lost before the interrupts are enabled, the console must be
+ * reacquired in order to re-enable the interrupts.
+ */
+ for (;;) {
+ if (console_enter_unsafe(wctxt) &&
+ __serial8250_set_IER(up, wctxt, ier)) {
+ break;
+ }
+ atomic_console_reacquire(wctxt, &wctxt_init);
+ }
/*
* The receive handling will happen properly because the
@@ -3483,8 +3656,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
if (up->msr_saved_flags)
serial8250_modem_status(up);
- if (locked)
- spin_unlock_irqrestore(&port->lock, flags);
+ console_exit_unsafe(wctxt);
+
+ return finished;
}
static unsigned int probe_baud(struct uart_port *port)
@@ -3504,6 +3678,7 @@ static unsigned int probe_baud(struct uart_port *port)
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
int baud = 9600;
int bits = 8;
int parity = 'n';
@@ -3513,6 +3688,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
if (!port->iobase && !port->membase)
return -ENODEV;
+ up->console_newline_needed = false;
+
if (options)
uart_parse_options(options, &baud, &parity, &bits, &flow);
else if (probe)
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 5313aa31930f4..16715f01bdb5a 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -9,6 +9,7 @@ config SERIAL_8250
depends on !S390
select SERIAL_CORE
select SERIAL_MCTRL_GPIO if GPIOLIB
+ select HAVE_ATOMIC_CONSOLE
help
This selects whether you want to include the driver for the standard
serial ports. The standard answer is Y. People who might say N
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index d8c2f3455eeba..a4e142ac6ec46 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2319,18 +2319,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
{
struct uart_amba_port *uap = amba_ports[co->index];
unsigned int old_cr = 0, new_cr;
- unsigned long flags;
+ unsigned long flags = 0;
int locked = 1;
clk_enable(uap->clk);
- local_irq_save(flags);
+ /*
+ * local_irq_save(flags);
+ *
+ * This local_irq_save() is nonsense. If we come in via sysrq
+ * handling then interrupts are already disabled. Aside from
+ * that, the port.sysrq check is racy on SMP regardless.
+ */
if (uap->port.sysrq)
locked = 0;
else if (oops_in_progress)
- locked = spin_trylock(&uap->port.lock);
+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
else
- spin_lock(&uap->port.lock);
+ spin_lock_irqsave(&uap->port.lock, flags);
/*
* First save the CR then disable the interrupts
@@ -2356,8 +2362,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
pl011_write(old_cr, uap, REG_CR);
if (locked)
- spin_unlock(&uap->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&uap->port.lock, flags);
clk_disable(uap->clk);
}
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 82d35dbbfa6cb..511cf17d87958 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1219,13 +1219,10 @@ serial_omap_console_write(struct console *co, const char *s,
unsigned int ier;
int locked = 1;
- local_irq_save(flags);
- if (up->port.sysrq)
- locked = 0;
- else if (oops_in_progress)
- locked = spin_trylock(&up->port.lock);
+ if (up->port.sysrq || oops_in_progress)
+ locked = spin_trylock_irqsave(&up->port.lock, flags);
else
- spin_lock(&up->port.lock);
+ spin_lock_irqsave(&up->port.lock, flags);
/*
* First save the IER then disable the interrupts
@@ -1252,8 +1249,7 @@ serial_omap_console_write(struct console *co, const char *s,
check_modem_status(up);
if (locked)
- spin_unlock(&up->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&up->port.lock, flags);
}
static int __init
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 728cb72be0666..409892b777d16 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2336,8 +2336,11 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
* able to Re-start_rx later.
*/
if (!console_suspend_enabled && uart_console(uport)) {
- if (uport->ops->start_rx)
+ if (uport->ops->start_rx) {
+ spin_lock_irq(&uport->lock);
uport->ops->stop_rx(uport);
+ spin_unlock_irq(&uport->lock);
+ }
goto unlock;
}
@@ -2430,8 +2433,11 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
if (console_suspend_enabled)
uart_change_pm(state, UART_PM_STATE_ON);
uport->ops->set_termios(uport, &termios, NULL);
- if (!console_suspend_enabled && uport->ops->start_rx)
+ if (!console_suspend_enabled && uport->ops->start_rx) {
+ spin_lock_irq(&uport->lock);
uport->ops->start_rx(uport);
+ spin_unlock_irq(&uport->lock);
+ }
if (console_suspend_enabled)
console_start(uport->cons);
}
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8e3de07f103da..af933c7a61d98 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3543,8 +3543,13 @@ static ssize_t show_cons_active(struct device *dev,
for_each_console(c) {
if (!c->device)
continue;
- if (!c->write)
- continue;
+ if (c->flags & CON_NO_BKL) {
+ if (!(c->write_thread || c->write_atomic))
+ continue;
+ } else {
+ if (!c->write)
+ continue;
+ }
if ((c->flags & CON_ENABLED) == 0)
continue;
cs[i++] = c;
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index e0758fe7936dc..ab9f42d478c8c 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NO_BKL, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
};
char flags[ARRAY_SIZE(con_flags) + 1];
struct console *con = v;
+ char con_write = '-';
unsigned int a;
dev_t dev = 0;
@@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_setwidth(m, 21 - 1);
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
- seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ if (con->flags & CON_NO_BKL) {
+ if (con->write_thread || con->write_atomic)
+ con_write = 'W';
+ } else {
+ if (con->write)
+ con_write = 'W';
+ }
+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write,
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
diff --git a/include/linux/console.h b/include/linux/console.h
index d3195664baa5a..1e9d5bc8fa76e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -16,7 +16,9 @@
#include <linux/atomic.h>
#include <linux/bits.h>
+#include <linux/irq_work.h>
#include <linux/rculist.h>
+#include <linux/rcuwait.h>
#include <linux/types.h>
struct vc_data;
@@ -154,6 +156,10 @@ static inline int con_debug_leave(void)
* receiving the printk spam for obvious reasons.
* @CON_EXTENDED: The console supports the extended output format of
* /dev/kmesg which requires a larger output buffer.
+ * @CON_SUSPENDED: Indicates if a console is suspended. If true, the
+ * printing callbacks must not be called.
+ * @CON_NO_BKL: Console can operate outside of the BKL-style console_lock
+ * constraints.
*/
enum cons_flags {
CON_PRINTBUFFER = BIT(0),
@@ -163,8 +169,133 @@ enum cons_flags {
CON_ANYTIME = BIT(4),
CON_BRL = BIT(5),
CON_EXTENDED = BIT(6),
+ CON_SUSPENDED = BIT(7),
+ CON_NO_BKL = BIT(8),
};
+/**
+ * struct cons_state - console state for NOBKL consoles
+ * @atom: Compound of the state fields for atomic operations
+ * @seq: Sequence for record tracking (64bit only)
+ * @bits: Compound of the state bits below
+ *
+ * @locked: Console is locked by a writer
+ * @unsafe: Console is busy in a non takeover region
+ * @thread: Current owner is the printk thread
+ * @cur_prio: The priority of the current output
+ * @req_prio: The priority of a handover request
+ * @cpu: The CPU on which the writer runs
+ *
+ * To be used for state read and preparation of atomic_long_cmpxchg()
+ * operations.
+ *
+ * The @req_prio field is particularly important: it allows spin-waiting to
+ * time out and give up without the risk of the waiter being assigned the
+ * lock after giving up. As a nice side effect, @req_prio also makes it
+ * possible to use a single read+cmpxchg in the common case of acquire
+ * and release.
+ */
+struct cons_state {
+ union {
+ unsigned long atom;
+ struct {
+#ifdef CONFIG_64BIT
+ u32 seq;
+#endif
+ union {
+ u32 bits;
+ struct {
+ u32 locked : 1;
+ u32 unsafe : 1;
+ u32 thread : 1;
+ u32 cur_prio : 2;
+ u32 req_prio : 2;
+ u32 cpu : 18;
+ };
+ };
+ };
+ };
+};
+
+/**
+ * cons_prio - console writer priority for NOBKL consoles
+ * @CONS_PRIO_NONE: Unused
+ * @CONS_PRIO_NORMAL: Regular printk
+ * @CONS_PRIO_EMERGENCY: Emergency output (WARN/OOPS...)
+ * @CONS_PRIO_PANIC: Panic output
+ * @CONS_PRIO_MAX: The number of priority levels
+ *
+ * Emergency output can carefully take over the console even without the
+ * consent of the owner, ideally only when @cons_state::unsafe is not set.
+ * Panic output can ignore the unsafe flag as a last resort. While panic
+ * output is active, no takeover is possible until the panic output
+ * releases the console.
+ */
+enum cons_prio {
+ CONS_PRIO_NONE = 0,
+ CONS_PRIO_NORMAL,
+ CONS_PRIO_EMERGENCY,
+ CONS_PRIO_PANIC,
+ CONS_PRIO_MAX,
+};
+
+struct console;
+struct printk_buffers;
+
+/**
+ * struct cons_context - Context for console acquire/release
+ * @console: The associated console
+ * @state: The state at acquire time
+ * @old_state: The old state when try_acquire() failed for analysis
+ * by the caller
+ * @hov_state: The handover state for spin and cleanup
+ * @req_state: The request state for spin and cleanup
+ * @spinwait_max_us: Limit for spinwait acquire
+ * @oldseq: The sequence number at acquire()
+ * @newseq: The sequence number for progress
+ * @prio: Priority of the context
+ * @pbufs: Pointer to the text buffer for this context
+ * @dropped: Dropped counter for the current context
+ * @thread: The acquire is printk thread context
+ * @hostile: Hostile takeover requested. Cleared on normal
+ * acquire or friendly handover
+ * @spinwait: Spinwait on acquire if possible
+ * @backlog: Ringbuffer has pending records
+ */
+struct cons_context {
+ struct console *console;
+ struct cons_state state;
+ struct cons_state old_state;
+ struct cons_state hov_state;
+ struct cons_state req_state;
+ u64 oldseq;
+ u64 newseq;
+ unsigned int spinwait_max_us;
+ enum cons_prio prio;
+ struct printk_buffers *pbufs;
+ unsigned long dropped;
+ unsigned int thread : 1;
+ unsigned int hostile : 1;
+ unsigned int spinwait : 1;
+ unsigned int backlog : 1;
+};
+
+/**
+ * struct cons_write_context - Context handed to the write callbacks
+ * @ctxt: The core console context
+ * @outbuf: Pointer to the text buffer for output
+ * @len: Length to write
+ * @unsafe: Invoked in unsafe state due to force takeover
+ */
+struct cons_write_context {
+ struct cons_context __private ctxt;
+ char *outbuf;
+ unsigned int len;
+ bool unsafe;
+};
+
+struct cons_context_data;
+
/**
* struct console - The console descriptor structure
* @name: The name of the console driver
@@ -184,6 +315,18 @@ enum cons_flags {
* @dropped: Number of unreported dropped ringbuffer records
* @data: Driver private data
* @node: hlist node for the console list
+ *
+ * @atomic_state: State array for NOBKL consoles; real and handover
+ * @atomic_seq: Sequence for record tracking (32bit only)
+ * @thread_pbufs: Pointer to thread private buffer
+ * @kthread: Pointer to kernel thread
+ * @rcuwait: RCU wait for the kernel thread
+ * @irq_work: IRQ work for thread wakeup
+ * @kthread_waiting: Indicator whether the kthread is waiting to be woken
+ * @write_atomic: Write callback for atomic context
+ * @write_thread: Write callback for printk threaded printing
+ * @port_lock: Callback to lock/unlock the port lock
+ * @pcpu_data: Pointer to percpu context data
*/
struct console {
char name[16];
@@ -203,6 +346,23 @@ struct console {
unsigned long dropped;
void *data;
struct hlist_node node;
+
+ /* NOBKL console specific members */
+ atomic_long_t __private atomic_state[2];
+#ifndef CONFIG_64BIT
+ atomic_t __private atomic_seq;
+#endif
+ struct printk_buffers *thread_pbufs;
+ struct task_struct *kthread;
+ struct rcuwait rcuwait;
+ struct irq_work irq_work;
+ atomic_t kthread_waiting;
+
+ bool (*write_atomic)(struct console *con, struct cons_write_context *wctxt);
+ bool (*write_thread)(struct console *con, struct cons_write_context *wctxt);
+ void (*port_lock)(struct console *con, bool do_lock, unsigned long *flags);
+
+ struct cons_context_data __percpu *pcpu_data;
};
#ifdef CONFIG_LOCKDEP
@@ -329,6 +489,24 @@ static inline bool console_is_registered(const struct console *con)
lockdep_assert_console_list_lock_held(); \
hlist_for_each_entry(con, &console_list, node)
+#ifdef CONFIG_PRINTK
+extern enum cons_prio cons_atomic_enter(enum cons_prio prio);
+extern void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio);
+extern bool console_can_proceed(struct cons_write_context *wctxt);
+extern bool console_enter_unsafe(struct cons_write_context *wctxt);
+extern bool console_exit_unsafe(struct cons_write_context *wctxt);
+extern bool console_try_acquire(struct cons_write_context *wctxt);
+extern bool console_release(struct cons_write_context *wctxt);
+#else
+static inline enum cons_prio cons_atomic_enter(enum cons_prio prio) { return CONS_PRIO_NONE; }
+static inline void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio) { }
+static inline bool console_can_proceed(struct cons_write_context *wctxt) { return false; }
+static inline bool console_enter_unsafe(struct cons_write_context *wctxt) { return false; }
+static inline bool console_exit_unsafe(struct cons_write_context *wctxt) { return false; }
+static inline bool console_try_acquire(struct cons_write_context *wctxt) { return false; }
+static inline bool console_release(struct cons_write_context *wctxt) { return false; }
+#endif
+
extern int console_set_on_cmdline;
extern struct console *early_console;
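To illustrate the API just declared: a minimal write_thread callback for a
NOBKL console, following the ownership checks used by the 8250 conversion
earlier in this patch. This is a hedged sketch; my_console_write_thread and
my_hw_put_char() are hypothetical stand-ins for a real driver.

static bool my_console_write_thread(struct console *con,
				    struct cons_write_context *wctxt)
{
	unsigned int i;

	for (i = 0; i < wctxt->len; i++) {
		/*
		 * Ownership may be handed over or taken away between
		 * characters; stop touching the hardware if it was lost.
		 */
		if (!console_can_proceed(wctxt))
			return false;
		my_hw_put_char(con, wctxt->outbuf[i]);	/* hypothetical */
	}

	/* All of wctxt->outbuf was written while owning the console. */
	return true;
}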
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d95ab85f96ba5..3dc3704a3cdbb 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -57,9 +57,15 @@
# define ARCH_EXIT_TO_USER_MODE_WORK (0)
#endif
+#ifdef CONFIG_PREEMPT_LAZY
+# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#else
+# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED)
+#endif
+
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b3..bf82980f569df 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);
+#ifdef CONFIG_PREEMPT_RT
+DECLARE_PER_CPU(struct task_struct *, timersd);
+DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
+
+extern void raise_timer_softirq(void);
+extern void raise_hrtimer_softirq(void);
+
+static inline unsigned int local_pending_timers(void)
+{
+ return __this_cpu_read(pending_timer_softirq);
+}
+
+#else
+static inline void raise_timer_softirq(void)
+{
+ raise_softirq(TIMER_SOFTIRQ);
+}
+
+static inline void raise_hrtimer_softirq(void)
+{
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
+static inline unsigned int local_pending_timers(void)
+{
+ return local_softirq_pending();
+}
+#endif
+
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
static inline struct task_struct *this_cpu_ksoftirqd(void)
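The wrappers above exist so that callers need not care whether timer softirqs
run in the usual softirq context or, on PREEMPT_RT, in the per-CPU timersd
thread. A hedged usage sketch (hrtimer_interrupt_tail is an illustrative
name, not a real caller):

static void hrtimer_interrupt_tail(void)
{
	/* Runs with interrupts disabled, e.g. from the timer interrupt. */
	if (!(local_pending_timers() & (1 << HRTIMER_SOFTIRQ)))
		raise_hrtimer_softirq();
}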
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 09d4f17c8d3b6..7376c1df9c901 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping,
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
}
@@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
kunmap_local_indexed((void __force *)vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
@@ -162,7 +168,10 @@ static inline void __iomem *
io_mapping_map_atomic_wc(struct io_mapping *mapping,
unsigned long offset)
{
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
}
@@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
io_mapping_unmap(vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c35f04f636f15..0c014424b1312 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3202,7 +3202,11 @@ struct softnet_data {
int defer_count;
int defer_ipi_scheduled;
struct sk_buff *defer_list;
+#ifndef CONFIG_PREEMPT_RT
call_single_data_t defer_csd;
+#else
+ struct work_struct defer_work;
+#endif
};
static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 0df425bf9bd75..05338f00a5907 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -196,6 +196,20 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)
+#ifdef CONFIG_PREEMPT_LAZY
+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
+#else
+#define add_preempt_lazy_count(val) do { } while (0)
+#define sub_preempt_lazy_count(val) do { } while (0)
+#define inc_preempt_lazy_count() do { } while (0)
+#define dec_preempt_lazy_count() do { } while (0)
+#define preempt_lazy_count() (0)
+#endif
+
#ifdef CONFIG_PREEMPT_COUNT
#define preempt_disable() \
@@ -204,6 +218,12 @@ do { \
barrier(); \
} while (0)
+#define preempt_lazy_disable() \
+do { \
+ inc_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define sched_preempt_enable_no_resched() \
do { \
barrier(); \
@@ -218,15 +238,21 @@ do { \
#define preempt_enable() \
do { \
barrier(); \
- if (unlikely(preempt_count_dec_and_test())) \
+ if (unlikely(preempt_count_dec_and_test())) { \
+ instrumentation_begin(); \
__preempt_schedule(); \
+ instrumentation_end(); \
+ } \
} while (0)
#define preempt_enable_notrace() \
do { \
barrier(); \
- if (unlikely(__preempt_count_dec_and_test())) \
+ if (unlikely(__preempt_count_dec_and_test())) { \
+ instrumentation_begin(); \
__preempt_schedule_notrace(); \
+ instrumentation_end(); \
+ } \
} while (0)
#define preempt_check_resched() \
@@ -235,6 +261,18 @@ do { \
__preempt_schedule(); \
} while (0)
+/*
+ * Open-code preempt_check_resched() here: it is not exported to modules,
+ * yet this macro is used by local_unlock() and bpf_enable_instrumentation().
+ */
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+ if (should_resched(0)) \
+ __preempt_schedule(); \
+} while (0)
+
#else /* !CONFIG_PREEMPTION */
#define preempt_enable() \
do { \
@@ -242,6 +280,12 @@ do { \
preempt_count_dec(); \
} while (0)
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define preempt_enable_notrace() \
do { \
barrier(); \
@@ -282,6 +326,9 @@ do { \
#define preempt_enable_notrace() barrier()
#define preemptible() 0
+#define preempt_lazy_disable() barrier()
+#define preempt_lazy_enable() barrier()
+
#endif /* CONFIG_PREEMPT_COUNT */
#ifdef MODULE
@@ -300,7 +347,7 @@ do { \
} while (0)
#define preempt_fold_need_resched() \
do { \
- if (tif_need_resched()) \
+ if (tif_need_resched_now()) \
set_preempt_need_resched(); \
} while (0)
@@ -416,8 +463,15 @@ extern void migrate_enable(void);
#else
-static inline void migrate_disable(void) { }
-static inline void migrate_enable(void) { }
+static inline void migrate_disable(void)
+{
+ preempt_lazy_disable();
+}
+
+static inline void migrate_enable(void)
+{
+ preempt_lazy_enable();
+}
#endif /* CONFIG_SMP */
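The preempt_fold_need_resched() change above is the core of lazy preemption:
only an immediate reschedule request is folded into the preempt count, so a
lazy request leaves the task running until it reaches a natural preemption
point. A hedged sketch of the distinction (fold_example is an illustrative
name):

static void fold_example(void)
{
	if (tif_need_resched_now())		/* TIF_NEED_RESCHED set */
		set_preempt_need_resched();	/* preempt ASAP */
	/*
	 * tif_need_resched_lazy() alone is deliberately ignored here;
	 * it is honoured on return to user space instead.
	 */
}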
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8ef499ab3c1ed..b55662624ff87 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -139,6 +139,7 @@ void early_printk(const char *s, ...) { }
#endif
struct dev_printk_info;
+struct cons_write_context;
#ifdef CONFIG_PRINTK
asmlinkage __printf(4, 0)
@@ -157,15 +158,17 @@ int _printk(const char *fmt, ...);
*/
__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);
-extern void __printk_safe_enter(void);
-extern void __printk_safe_exit(void);
+extern void __printk_safe_enter(unsigned long *flags);
+extern void __printk_safe_exit(unsigned long *flags);
+extern void __printk_deferred_enter(void);
+extern void __printk_deferred_exit(void);
/*
* The printk_deferred_enter/exit macros are available only as a hack for
* some code paths that need to defer all printk console printing. Interrupts
* must be disabled for the deferred duration.
*/
-#define printk_deferred_enter __printk_safe_enter
-#define printk_deferred_exit __printk_safe_exit
+#define printk_deferred_enter() __printk_deferred_enter()
+#define printk_deferred_exit() __printk_deferred_exit()
/*
* Please don't use printk_ratelimit(), because it shares ratelimiting state
@@ -192,6 +195,8 @@ void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
extern asmlinkage void dump_stack(void) __cold;
void printk_trigger_flush(void);
+extern void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt,
+ bool skip_unsafe);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
@@ -271,6 +276,12 @@ static inline void dump_stack(void)
static inline void printk_trigger_flush(void)
{
}
+
+static inline void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt,
+ bool skip_unsafe)
+{
+}
+
#endif
#ifdef CONFIG_SMP
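The deferred-printk helpers keep their documented contract after the rework:
interrupts must stay disabled for the whole deferred section. A hedged usage
sketch (report_from_atomic_context is an illustrative name):

static void report_from_atomic_context(void)
{
	unsigned long flags;

	local_irq_save(flags);
	printk_deferred_enter();
	/* Printing here is deferred instead of hitting consoles directly. */
	pr_warn("deferred diagnostic\n");
	printk_deferred_exit();
	local_irq_restore(flags);
}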
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a9..593d7e7c2e4c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -303,6 +303,11 @@ extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
+
+extern void sched_submit_work(void);
+extern void sched_resume_work(void);
+extern void schedule_rtmutex(void);
+
#ifdef CONFIG_PREEMPT_RT
extern void schedule_rtlock(void);
#endif
@@ -2059,6 +2064,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+#ifdef CONFIG_PREEMPT_LAZY
+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
+}
+
+static inline int need_resched_lazy(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#else
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
+static inline int need_resched_lazy(void) { return 0; }
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#endif
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 20099268fa257..b38ce53576000 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -22,6 +22,7 @@ struct sighand_struct {
refcount_t count;
wait_queue_head_t signalfd_wqh;
struct k_sigaction action[_NSIG];
+ struct sigqueue *sigqueue_cache;
};
/*
@@ -135,7 +136,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
@@ -349,6 +350,7 @@ extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
+extern void sigqueue_free_cached_entry(struct sigqueue *q);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 19376bee96676..9055a22992edc 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -125,6 +125,8 @@ struct uart_8250_port {
#define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
unsigned char msr_saved_flags;
+ bool console_newline_needed;
+
struct uart_8250_dma *dma;
const struct uart_8250_ops *ops;
@@ -139,6 +141,9 @@ struct uart_8250_port {
/* Serial port overrun backoff */
struct delayed_work overrun_backoff;
u32 overrun_backoff_time_ms;
+
+ struct cons_write_context wctxt;
+ int cookie;
};
static inline struct uart_8250_port *up_to_u8250p(struct uart_port *up)
@@ -178,8 +183,10 @@ void serial8250_tx_chars(struct uart_8250_port *up);
unsigned int serial8250_modem_status(struct uart_8250_port *up);
void serial8250_init_port(struct uart_8250_port *up);
void serial8250_set_defaults(struct uart_8250_port *up);
-void serial8250_console_write(struct uart_8250_port *up, const char *s,
- unsigned int count);
+bool serial8250_console_write_atomic(struct uart_8250_port *up,
+ struct cons_write_context *wctxt);
+bool serial8250_console_write_thread(struct uart_8250_port *up,
+ struct cons_write_context *wctxt);
int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
int serial8250_console_exit(struct uart_port *port);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index c02646884fa83..c1acb81b81948 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -178,6 +178,26 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
#endif /* !CONFIG_GENERIC_ENTRY */
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+# ifdef CONFIG_PREEMPT_LAZY
+
+static __always_inline bool tif_need_resched(void)
+{
+ return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
+}
+
+static __always_inline bool tif_need_resched_now(void)
+{
+ return arch_test_bit(TIF_NEED_RESCHED,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return arch_test_bit(TIF_NEED_RESCHED_LAZY,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+# else /* !CONFIG_PREEMPT_LAZY */
static __always_inline bool tif_need_resched(void)
{
@@ -185,7 +205,38 @@ static __always_inline bool tif_need_resched(void)
(unsigned long *)(&current_thread_info()->flags));
}
-#else
+static __always_inline bool tif_need_resched_now(void)
+{
+ return tif_need_resched();
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return false;
+}
+
+# endif /* CONFIG_PREEMPT_LAZY */
+#else /* !_ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
+# ifdef CONFIG_PREEMPT_LAZY
+
+static __always_inline bool tif_need_resched(void)
+{
+ return read_thread_flags() & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY);
+}
+
+static __always_inline bool tif_need_resched_now(void)
+{
+ return test_bit(TIF_NEED_RESCHED,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return test_bit(TIF_NEED_RESCHED_LAZY,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+# else /* !CONFIG_PREEMPT_LAZY */
static __always_inline bool tif_need_resched(void)
{
@@ -193,6 +244,17 @@ static __always_inline bool tif_need_resched(void)
(unsigned long *)(&current_thread_info()->flags));
}
+static __always_inline bool tif_need_resched_now(void)
+{
+ return tif_need_resched();
+}
+
+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return false;
+}
+
+# endif /* !CONFIG_PREEMPT_LAZY */
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
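With CONFIG_PREEMPT_LAZY the generic tif_need_resched() above reports both
flags, which keeps existing callers working unchanged, while new code can
distinguish the two requests. A hedged sketch (resched_kind is an
illustrative name):

static const char *resched_kind(void)
{
	if (tif_need_resched_now())
		return "immediate";	/* TIF_NEED_RESCHED */
	if (tif_need_resched_lazy())
		return "lazy";		/* TIF_NEED_RESCHED_LAZY */
	return "none";			/* tif_need_resched() is false */
}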
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0e373222a6df8..47017fcf5481f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -70,6 +70,7 @@ struct trace_entry {
unsigned char flags;
unsigned char preempt_count;
int pid;
+ unsigned char preempt_lazy_count;
};
#define TRACE_EVENT_TYPE_MAX \
@@ -158,9 +159,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
unsigned int trace_ctx)
{
entry->preempt_count = trace_ctx & 0xff;
+ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff;
entry->pid = current->pid;
entry->type = type;
- entry->flags = trace_ctx >> 16;
+ entry->flags = trace_ctx >> 24;
}
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
@@ -171,7 +173,13 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+#ifdef CONFIG_PREEMPT_LAZY
+ TRACE_FLAG_PREEMPT_RESCHED = 0x00,
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x20,
+#else
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x00,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+#endif
TRACE_FLAG_NMI = 0x40,
TRACE_FLAG_BH_OFF = 0x80,
};
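The hunk above shifts the trace flags from bits 16-23 to bits 24-31 to make
room for the lazy-preempt count. A small stand-alone C program showing the
resulting layout (the values are made up for illustration):

#include <stdio.h>

int main(void)
{
	/* flags = 0x04 (NEED_RESCHED), lazy count = 1, preempt count = 2 */
	unsigned int trace_ctx = (0x04u << 24) | (0x01u << 16) | 0x02u;

	printf("preempt_count=%u\n", trace_ctx & 0xff);			/* 2 */
	printf("preempt_lazy_count=%u\n", (trace_ctx >> 16) & 0xff);	/* 1 */
	printf("flags=0x%02x\n", trace_ctx >> 24);			/* 0x04 */
	return 0;
}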
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 3e8619c72f774..b4bc2828fa09f 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -158,7 +158,11 @@ DEFINE_EVENT(timer_class, timer_cancel,
{ HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
{ HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
{ HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
- { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
+ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \
+ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \
+ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \
+ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \
+ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" })
/**
* hrtimer_init - called when the hrtimer is initialized
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a8214..260c08efeb486 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
+config HAVE_PREEMPT_LAZY
+ bool
+
+config PREEMPT_LAZY
+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT
+
config PREEMPT_NONE_BUILD
bool
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 5c7e9ba7cd6b2..e9139dfc1f0a8 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -576,6 +576,8 @@ static void kdb_msg_write(const char *msg, int msg_len)
continue;
if (c == dbg_io_ops->cons)
continue;
+ if (!c->write)
+ continue;
/*
* Set oops_in_progress to encourage the console drivers to
* disregard their internal spin locks: in the current calling
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b54..c6301e520d47b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -155,7 +155,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & _TIF_NEED_RESCHED_MASK)
schedule();
if (ti_work & _TIF_UPROBE)
@@ -386,7 +386,7 @@ void raw_irqentry_exit_cond_resched(void)
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (should_resched(0))
preempt_schedule_irq();
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index ea332319dffea..d78954a3834cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1665,6 +1665,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
RCU_INIT_POINTER(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
+ sig->sigqueue_cache = NULL;
refcount_set(&sig->count, 1);
spin_lock_irq(&current->sighand->siglock);
@@ -1681,7 +1682,17 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
if (refcount_dec_and_test(&sighand->count)) {
+ struct sigqueue *sigqueue = NULL;
+
signalfd_cleanup(sighand);
+ spin_lock_irq(&sighand->siglock);
+ if (sighand->sigqueue_cache) {
+ sigqueue = sighand->sigqueue_cache;
+ sighand->sigqueue_cache = NULL;
+ }
+ spin_unlock_irq(&sighand->siglock);
+
+ sigqueue_free_cached_entry(sigqueue);
/*
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
* without an RCU grace period, see __lock_task_sighand().
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0408aab80941b..ee063d28f94f8 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -167,6 +167,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_CRASH_CORE */
+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -259,6 +268,9 @@ static struct attribute * kernel_attrs[] = {
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
&rcu_normal_attr.attr,
+#endif
+#ifdef CONFIG_PREEMPT_RT
+ &realtime_attr.attr,
#endif
NULL
};
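The attribute above gives user space a stable probe for PREEMPT_RT: the file
exists and reads "1" only on RT kernels. A small stand-alone reader:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (f) {
		if (fscanf(f, "%d", &rt) != 1)
			rt = 0;
		fclose(f);
	}
	printf("PREEMPT_RT kernel: %s\n", rt ? "yes" : "no");
	return 0;
}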
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 728f434de2bbf..479a9487edcc2 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
return try_cmpxchg_acquire(&lock->owner, &old, new);
}
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -297,6 +302,24 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
}
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled the rt_mutex_cmpxchg_acquire() based trylock
+ * always fails, which would unconditionally invoke
+ * sched_submit/resume_work() in the slow path of __rt_mutex_lock()
+ * and __ww_rt_mutex_lock() even in the non-contended case.
+ *
+ * Avoid that by using rt_mutex_slowtrylock(), which is covered by
+ * the debug code and can acquire a non-contended rtmutex. On
+ * success the call site avoids the sched_submit/resume_work()
+ * dance.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -1555,7 +1578,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
raw_spin_unlock_irq(&lock->wait_lock);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
- schedule();
+ schedule_rtmutex();
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1584,7 +1607,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
WARN(1, "rtmutex deadlock detected\n");
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ schedule_rtmutex();
}
}
@@ -1679,6 +1702,12 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
unsigned long flags;
int ret;
+ /*
+ * The task is about to sleep. Invoke sched_submit_work() now, before
+ * blocking, as it may take locks and would corrupt tsk::pi_blocked_on.
+ */
+ sched_submit_work();
+
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
* be called in early boot if the cmpxchg() fast path is disabled
@@ -1691,13 +1720,16 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ sched_resume_work();
return ret;
}
static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
unsigned int state)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (likely(rt_mutex_try_acquire(lock)))
return 0;
return rt_mutex_slowlock(lock, NULL, state);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index c201aadb93017..5be92ca5afabc 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -72,15 +72,6 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
int ret;
raw_spin_lock_irq(&rtm->wait_lock);
- /*
- * Allow readers, as long as the writer has not completely
- * acquired the semaphore for write.
- */
- if (atomic_read(&rwb->readers) != WRITER_BIAS) {
- atomic_inc(&rwb->readers);
- raw_spin_unlock_irq(&rtm->wait_lock);
- return 0;
- }
/*
* Call into the slow lock path with the rtmutex->wait_lock
@@ -140,10 +131,23 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state)
{
+ int ret;
+
+ lockdep_assert(!current->pi_blocked_on);
+
if (rwbase_read_trylock(rwb))
return 0;
- return __rwbase_read_lock(rwb, state);
+ /*
+ * The task is about to sleep. For rwsems, submit pending work first,
+ * as doing so later might take locks and corrupt tsk::pi_blocked_on.
+ * This must be explicit here because __rwbase_read_lock() cannot
+ * invoke rt_mutex_slowlock(). NOP for rwlocks.
+ */
+ rwbase_sched_submit_work();
+ ret = __rwbase_read_lock(rwb, state);
+ rwbase_sched_resume_work();
+ return ret;
}
static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -239,7 +243,10 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
struct rt_mutex_base *rtm = &rwb->rtmutex;
unsigned long flags;
- /* Take the rtmutex as a first step */
+ /*
+ * Take the rtmutex as a first step. For rwsem this will also
+ * invoke sched_submit_work() to flush IO and workers.
+ */
if (rwbase_rtmutex_lock_state(rtm, state))
return -EINTR;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9eabd585ce7af..e304db9ebfd95 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,6 +1415,12 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#define rwbase_rtmutex_lock_state(rtm, state) \
__rt_mutex_lock(rtm, state)
+#define rwbase_sched_submit_work() \
+ sched_submit_work()
+
+#define rwbase_sched_resume_work() \
+ sched_resume_work()
+
#define rwbase_rtmutex_slowlock_locked(rtm, state) \
__rt_mutex_slowlock_locked(rtm, NULL, state)
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d8..9fe282cd145d9 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -37,6 +37,8 @@
static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
{
+ lockdep_assert(!current->pi_blocked_on);
+
if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
rtlock_slowlock(rtm);
}
@@ -159,6 +161,9 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
return 0;
}
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
static __always_inline int
rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
{
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index d1473c624105c..c7196de838edc 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
}
mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
- if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
if (ww_ctx)
ww_mutex_set_context_fastpath(lock, ww_ctx);
return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 5cfea8302d23a..190f7f2bc6cfd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -275,6 +275,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
*/
void panic(const char *fmt, ...)
{
+ enum cons_prio prev_prio;
static char buf[1024];
va_list args;
long i, i_next = 0, len;
@@ -322,6 +323,8 @@ void panic(const char *fmt, ...)
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
+ prev_prio = cons_atomic_enter(CONS_PRIO_PANIC);
+
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
@@ -382,6 +385,8 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
+ cons_atomic_flush(NULL, true);
+
console_unblank();
/*
@@ -406,6 +411,7 @@ void panic(const char *fmt, ...)
* We can't use the "normal" timers since we just panicked.
*/
pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
+ cons_atomic_flush(NULL, true);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -424,6 +430,7 @@ void panic(const char *fmt, ...)
*/
if (panic_reboot_mode != REBOOT_UNDEFINED)
reboot_mode = panic_reboot_mode;
+ cons_atomic_flush(NULL, true);
emergency_restart();
}
#ifdef __sparc__
@@ -436,12 +443,16 @@ void panic(const char *fmt, ...)
}
#endif
#if defined(CONFIG_S390)
+ cons_atomic_flush(NULL, true);
disabled_wait();
#endif
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
/* Do not scroll important messages printed above */
suppress_printk = 1;
+
+ cons_atomic_exit(CONS_PRIO_PANIC, prev_prio);
+
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -652,6 +663,10 @@ struct warn_args {
void __warn(const char *file, int line, void *caller, unsigned taint,
struct pt_regs *regs, struct warn_args *args)
{
+ enum cons_prio prev_prio;
+
+ prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+
disable_trace_on_warning();
if (file)
@@ -682,6 +697,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
+
+ cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
}
#ifndef __WARN_FLAGS
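Annotation: the panic()/__warn() hunks above bracket their output in a console priority section. A minimal usage sketch, assuming the cons_atomic_enter()/cons_atomic_exit() API added by printk_nobkl.c later in this patch; the function name is a hypothetical stand-in.

	static void emergency_report_sketch(void)
	{
		enum cons_prio prev_prio;

		/* Raise this CPU's console priority and remember the old one. */
		prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);

		/* ... emit diagnostics; atomic consoles flush at raised prio ... */

		/* Nesting-safe restore of the previous priority. */
		cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
	}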
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9f..b36683bd2f821 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o printk_nobkl.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK_INDEX) += index.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1d..6631fd70542f9 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,8 @@
* internal.h - printk internal definitions
*/
#include <linux/percpu.h>
+#include <linux/console.h>
+#include "printk_ringbuffer.h"
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
void __init printk_sysctl_init(void);
@@ -12,8 +14,13 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
#define printk_sysctl_init() do { } while (0)
#endif
-#ifdef CONFIG_PRINTK
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NO_BKL) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+#ifdef CONFIG_PRINTK
#ifdef CONFIG_PRINTK_CALLER
#define PRINTK_PREFIX_MAX 48
#else
@@ -35,6 +42,12 @@ enum printk_info_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+extern struct printk_ringbuffer *prb;
+extern bool have_bkl_console;
+extern bool printk_threads_enabled;
+
+extern bool have_boot_console;
+
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -45,28 +58,98 @@ __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
bool printk_percpu_data_ready(void);
+/*
+ * The printk_safe_enter()/_exit() macros mark code blocks using locks that
+ * would lead to deadlock if an interrupting context were to call printk()
+ * while the interrupted context was within such code blocks.
+ *
+ * When a CPU is in such a code block, an interrupting context calling
+ * printk() will only log the new message to the lockless ringbuffer and
+ * then trigger console printing using irqwork.
+ */
+
#define printk_safe_enter_irqsave(flags) \
do { \
- local_irq_save(flags); \
- __printk_safe_enter(); \
+ __printk_safe_enter(&flags); \
} while (0)
#define printk_safe_exit_irqrestore(flags) \
do { \
- __printk_safe_exit(); \
- local_irq_restore(flags); \
+ __printk_safe_exit(&flags); \
} while (0)
void defer_console_output(void);
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
+
+u64 cons_read_seq(struct console *con);
+void cons_nobkl_cleanup(struct console *con);
+bool cons_nobkl_init(struct console *con);
+bool cons_alloc_percpu_data(struct console *con);
+void cons_kthread_create(struct console *con);
+void cons_wake_threads(void);
+void cons_force_seq(struct console *con, u64 seq);
+void console_bkl_kthread_create(void);
+
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records. If the caller only works with certain types of consoles, the
+ * caller is responsible for checking the console type before calling
+ * this function.
+ */
+static inline bool console_is_usable(struct console *con, short flags)
+{
+ if (!(flags & CON_ENABLED))
+ return false;
+
+ if ((flags & CON_SUSPENDED))
+ return false;
+
+ /*
+ * The usability of a console varies depending on whether
+ * it is a NOBKL console or not.
+ */
+
+ if (flags & CON_NO_BKL) {
+ if (have_boot_console)
+ return false;
+
+ } else {
+ if (!con->write)
+ return false;
+ /*
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * cons_kthread_wake - Wake up a printk thread
+ * @con: Console to operate on
+ */
+static inline void cons_kthread_wake(struct console *con)
+{
+ rcuwait_wake_up(&con->rcuwait);
+}
+
#else
#define PRINTK_PREFIX_MAX 0
#define PRINTK_MESSAGE_MAX 0
#define PRINTKRB_RECORD_MAX 0
+static inline void cons_kthread_wake(struct console *con) { }
+static inline void cons_kthread_create(struct console *con) { }
+#define printk_threads_enabled (false)
+
/*
* In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
@@ -76,8 +159,15 @@ u16 printk_parse_prefix(const char *text, int *level,
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline bool cons_nobkl_init(struct console *con) { return true; }
+static inline void cons_nobkl_cleanup(struct console *con) { }
+static inline bool console_is_usable(struct console *con, short flags) { return false; }
+static inline void cons_force_seq(struct console *con, u64 seq) { }
+
#endif /* CONFIG_PRINTK */
+extern bool have_boot_console;
+
/**
* struct printk_buffers - Buffers to read/format/output printk messages.
* @outbuf: After formatting, contains text to output.
@@ -103,3 +193,28 @@ struct printk_message {
u64 seq;
unsigned long dropped;
};
+
+/**
+ * struct cons_context_data - console context data
+ * @wctxt: Write context per priority level
+ * @pbufs: Buffer for storing the text
+ *
+ * Used for early boot and for per CPU data.
+ *
+ * The write contexts are allocated to avoid having them on stack, e.g. in
+ * warn() or panic().
+ */
+struct cons_context_data {
+ struct cons_write_context wctxt[CONS_PRIO_MAX];
+ struct printk_buffers pbufs;
+};
+
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress);
+
+#ifdef CONFIG_PRINTK
+
+void console_prepend_dropped(struct printk_message *pmsg,
+ unsigned long dropped);
+
+#endif
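Annotation: console_is_usable() now takes a flags snapshot instead of reading console state itself. A sketch of the intended call pattern, mirroring the console_flush_all() loop in the printk.c hunks below; the function name is a hypothetical stand-in.

	static void flush_legacy_consoles_sketch(void)
	{
		struct console *con;
		int cookie;

		cookie = console_srcu_read_lock();
		for_each_console_srcu(con) {
			short flags = console_srcu_read_flags(con);

			/* Hand the same flags snapshot to the usability check. */
			if (!console_is_usable(con, flags))
				continue;

			/* ... emit pending records to @con ... */
		}
		console_srcu_read_unlock(cookie);
	}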
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd0c9f913940a..e2466366d4f84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -442,6 +442,21 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);
+/*
+ * Specifies if a BKL console was ever registered. Used to determine if the
+ * console lock/unlock dance is needed for console printing.
+ */
+bool have_bkl_console;
+
+/*
+ * Specifies if a boot console is registered. Used to determine if NOBKL
+ * consoles may be used since NOBKL consoles cannot synchronize with boot
+ * consoles.
+ */
+bool have_boot_console;
+
+static int unregister_console_locked(struct console *console);
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
@@ -492,7 +507,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
static struct printk_ringbuffer printk_rb_dynamic;
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@@ -696,9 +711,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
return len;
}
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_supress);
-
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
@@ -1100,7 +1112,19 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
+ struct hlist_node *tmp;
+ struct console *con;
+
+ console_list_lock();
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (!cons_alloc_percpu_data(con))
+ unregister_console_locked(con);
+ }
+
__printk_percpu_data_ready = true;
+
+ console_list_unlock();
}
static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
@@ -2276,6 +2300,7 @@ asmlinkage int vprintk_emit(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
+ struct cons_write_context wctxt = { };
int printed_len;
bool in_sched = false;
@@ -2296,16 +2321,25 @@ asmlinkage int vprintk_emit(int facility, int level,
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
+ /*
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
+ */
+ preempt_disable();
+
+ /*
+ * Flush the non-BKL consoles. This only leads to direct atomic
+ * printing for non-BKL consoles that do not have a printer
+ * thread available. Otherwise the printer thread will perform
+ * the printing.
+ */
+ cons_atomic_flush(&wctxt, true);
+
/* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
- /*
- * The caller may be holding system-critical or
- * timing-sensitive locks. Disable preemption during
- * printing of all remaining records to all consoles so that
- * this context can return as soon as possible. Hopefully
- * another printk() caller will take over the printing.
- */
- preempt_disable();
+ if (!in_sched && have_bkl_console && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers. With the
@@ -2314,10 +2348,15 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (console_trylock_spinning())
console_unlock();
- preempt_enable();
}
- wake_up_klogd();
+ preempt_enable();
+
+ cons_wake_threads();
+ if (in_sched)
+ defer_console_output();
+ else
+ wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2556,10 +2595,26 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
pr_flush(1000, true);
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
+
console_lock();
console_suspended = 1;
up_console_sem();
@@ -2567,11 +2622,39 @@ void suspend_console(void)
void resume_console(void)
{
+ struct console *con;
+ short flags;
+ int cookie;
+
if (!console_suspend_enabled)
return;
down_console_sem();
console_suspended = 0;
console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ flags = console_srcu_read_flags(con);
+ if (flags & CON_NO_BKL)
+ cons_kthread_wake(con);
+ }
+ console_srcu_read_unlock(cookie);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && have_bkl_console)
+ wake_up_interruptible(&log_wait);
+
pr_flush(1000, true);
}
@@ -2586,7 +2669,7 @@ void resume_console(void)
*/
static int console_cpu_notify(unsigned int cpu)
{
- if (!cpuhp_tasks_frozen) {
+ if (!cpuhp_tasks_frozen && have_bkl_console) {
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
@@ -2661,33 +2744,6 @@ static bool abandon_console_lock_in_panic(void)
return atomic_read(&panic_cpu) != raw_smp_processor_id();
}
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_srcu_read_lock.
- */
-static inline bool console_is_usable(struct console *con)
-{
- short flags = console_srcu_read_flags(con);
-
- if (!(flags & CON_ENABLED))
- return false;
-
- if (!con->write)
- return false;
-
- /*
- * Console drivers may assume that per-cpu resources have been
- * allocated. So unless they're explicitly marked as being able to
- * cope (CON_ANYTIME) don't call them until this CPU is officially up.
- */
- if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
- return false;
-
- return true;
-}
-
static void __console_unlock(void)
{
console_locked = 0;
@@ -2709,7 +2765,7 @@ static void __console_unlock(void)
* If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
#ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
struct printk_buffers *pbufs = pmsg->pbufs;
const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@@ -2741,7 +2797,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
pmsg->outbuf_len += len;
}
#else
-#define console_prepend_dropped(pmsg, dropped)
+static inline void console_prepend_dropped(struct printk_message *pmsg,
+ unsigned long dropped) { }
#endif /* CONFIG_PRINTK */
/*
@@ -2763,8 +2820,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
* of @pmsg are valid. (See the documentation of struct printk_message
* for information about the @pmsg fields.)
*/
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
{
static int panic_console_dropped;
@@ -2933,9 +2990,14 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
cookie = console_srcu_read_lock();
for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
bool progress;
- if (!console_is_usable(con))
+ /* console_flush_all() is only for legacy consoles. */
+ if (flags & CON_NO_BKL)
+ continue;
+
+ if (!console_is_usable(con, flags))
continue;
any_usable = true;
@@ -2973,30 +3035,13 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
return false;
}
-/**
- * console_unlock - unblock the console subsystem from printing
- *
- * Releases the console_lock which the caller holds to block printing of
- * the console subsystem.
- *
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * console_unlock(); may be called from any context.
- */
-void console_unlock(void)
+static u64 console_flush_and_unlock(void)
{
bool do_cond_resched;
bool handover;
bool flushed;
u64 next_seq;
- if (console_suspended) {
- up_console_sem();
- return;
- }
-
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -3033,6 +3078,39 @@ void console_unlock(void)
* fails, another context is already handling the printing.
*/
} while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
+
+ return next_seq;
+}
+
+/**
+ * console_unlock - unblock the console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ if (console_suspended) {
+ up_console_sem();
+ return;
+ }
+
+ /*
+ * PREEMPT_RT relies on kthread and atomic consoles for printing.
+ * It never attempts to print from console_unlock().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ __console_unlock();
+ return;
+ }
+
+ console_flush_and_unlock();
}
EXPORT_SYMBOL(console_unlock);
@@ -3057,6 +3135,9 @@ void console_unblank(void)
struct console *c;
int cookie;
+ if (!have_bkl_console)
+ return;
+
/*
* Stop console printing because the unblank() callback may
* assume the console is not within its write() callback.
@@ -3065,6 +3146,10 @@ void console_unblank(void)
* In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
if (down_trylock_console_sem() != 0)
return;
} else
@@ -3094,23 +3179,46 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ struct console *c;
+ short flags;
+ int cookie;
+ u64 seq;
+
+ seq = prb_first_valid_seq(prb);
+
+ /*
+ * Safely flush the atomic consoles before trying to flush any
+ * BKL/legacy consoles.
+ */
+ if (mode == CONSOLE_REPLAY_ALL) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ flags = console_srcu_read_flags(c);
+ if (flags & CON_NO_BKL)
+ cons_force_seq(c, seq);
+ }
+ console_srcu_read_unlock(cookie);
+ }
+ cons_atomic_flush(NULL, true);
+
+ if (!have_bkl_console)
+ return;
+
/*
* If someone else is holding the console lock, trylock will fail
* and may_schedule may be set. Ignore and proceed to unlock so
* that messages are flushed out. As this can be called from any
* context and we don't want to get preempted while flushing,
* ensure may_schedule is cleared.
+ *
+ * Since semaphores are not NMI-safe, the console lock must be
+ * ignored if the panic is in NMI context.
*/
- console_trylock();
+ if (!in_nmi())
+ console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
- struct console *c;
- int cookie;
- u64 seq;
-
- seq = prb_first_valid_seq(prb);
-
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
/*
@@ -3122,7 +3230,8 @@ void console_flush_on_panic(enum con_flush_mode mode)
}
console_srcu_read_unlock(cookie);
}
- console_unlock();
+ if (!in_nmi())
+ console_unlock();
}
/*
@@ -3179,13 +3288,118 @@ EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
+ short flags;
+
console_list_lock();
console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ flags = console->flags;
console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. The related
+ * printing context must be able to see it is enabled so that
+ * it is guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ if (flags & CON_NO_BKL)
+ cons_kthread_wake(console);
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ wake_up_interruptible(&log_wait);
+
__pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
+static struct task_struct *console_bkl_kthread;
+
+static bool printer_should_wake(u64 seq)
+{
+ bool available = false;
+ struct console *con;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+
+ if (flags & CON_NO_BKL)
+ continue;
+ if (!console_is_usable(con, flags))
+ continue;
+ /*
+ * It is safe to read @seq because only this
+ * thread context updates @seq.
+ */
+ if (prb_read_valid(prb, con->seq, NULL)) {
+ available = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+
+ return available;
+}
+
+static int console_bkl_kthread_func(void *unused)
+{
+ u64 seq = 0;
+ int error;
+
+ for (;;) {
+ error = wait_event_interruptible(log_wait, printer_should_wake(seq));
+
+ if (kthread_should_stop())
+ break;
+
+ if (error)
+ continue;
+
+ console_lock();
+ if (console_suspended)
+ up_console_sem();
+ else
+ seq = console_flush_and_unlock();
+ }
+ return 0;
+}
+
+void console_bkl_kthread_create(void)
+{
+ struct task_struct *kt;
+ struct console *c;
+
+ lockdep_assert_held(&console_mutex);
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
+ if (!printk_threads_enabled || console_bkl_kthread)
+ return;
+
+ for_each_console(c) {
+ if (c->flags & CON_BOOT)
+ return;
+ }
+
+ kt = kthread_run(console_bkl_kthread_func, NULL, "pr/bkl");
+ if (IS_ERR(kt)) {
+ pr_err("unable to start BKL printing thread\n");
+ return;
+ }
+
+ console_bkl_kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(console_bkl_kthread, -20);
+}
+
static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
@@ -3269,11 +3483,6 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}
-#define con_printk(lvl, con, fmt, ...) \
- printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
- (con->flags & CON_BOOT) ? "boot" : "", \
- con->name, con->index, ##__VA_ARGS__)
-
static void console_init_seq(struct console *newcon, bool bootcon_registered)
{
struct console *con;
@@ -3338,8 +3547,6 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered)
#define console_first() \
hlist_entry(console_list.first, struct console, node)
-static int unregister_console_locked(struct console *console);
-
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -3431,6 +3638,16 @@ void register_console(struct console *newcon)
newcon->dropped = 0;
console_init_seq(newcon, bootcon_registered);
+ if (!(newcon->flags & CON_NO_BKL)) {
+ have_bkl_console = true;
+ console_bkl_kthread_create();
+ } else if (!cons_nobkl_init(newcon)) {
+ goto unlock;
+ }
+
+ if (newcon->flags & CON_BOOT)
+ have_boot_console = true;
+
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
@@ -3474,6 +3691,9 @@ void register_console(struct console *newcon)
if (con->flags & CON_BOOT)
unregister_console_locked(con);
}
+
+ /* All boot consoles have been unregistered. */
+ have_boot_console = false;
}
unlock:
console_list_unlock();
@@ -3483,11 +3703,13 @@ EXPORT_SYMBOL(register_console);
/* Must be called under console_list_lock(). */
static int unregister_console_locked(struct console *console)
{
+ struct console *c;
+ bool is_boot_con;
int res;
lockdep_assert_console_list_lock_held();
- con_printk(KERN_INFO, console, "disabled\n");
+ is_boot_con = console->flags & CON_BOOT;
res = _braille_unregister_console(console);
if (res < 0)
@@ -3495,12 +3717,13 @@ static int unregister_console_locked(struct console *console)
if (res > 0)
return 0;
- /* Disable it unconditionally */
- console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
-
if (!console_is_registered_locked(console))
return -ENODEV;
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+
+ con_printk(KERN_INFO, console, "disabled\n");
+
hlist_del_init_rcu(&console->node);
/*
@@ -3522,11 +3745,23 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);
+ if (console->flags & CON_NO_BKL)
+ cons_nobkl_cleanup(console);
+
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
+ /*
+ * Each time a boot console unregisters, try to start up the printing
+ * threads. They will only start if this was the last boot console.
+ */
+ if (is_boot_con) {
+ for_each_console(c)
+ cons_kthread_create(c);
+ }
+
return res;
}
@@ -3688,31 +3923,36 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
/*
* Hold the console_lock to guarantee safe access to
- * console->seq and to prevent changes to @console_suspended
- * until all consoles have been processed.
+ * console->seq.
*/
console_lock();
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
+ short flags;
+
if (con && con != c)
continue;
- if (!console_is_usable(c))
+
+ flags = console_srcu_read_flags(c);
+
+ if (!console_is_usable(c, flags))
continue;
+
+ /*
+ * Since the console is locked, use this opportunity
+ * to update console->seq for NOBKL consoles.
+ */
+ if (flags & CON_NO_BKL)
+ c->seq = cons_read_seq(c);
+
printk_seq = c->seq;
if (printk_seq < seq)
diff += seq - printk_seq;
}
console_srcu_read_unlock(cookie);
- /*
- * If consoles are suspended, it cannot be expected that they
- * make forward progress, so timeout immediately. @diff is
- * still used to return a valid flush status.
- */
- if (console_suspended)
- remaining = 0;
- else if (diff != last_diff && reset_on_progress)
+ if (diff != last_diff && reset_on_progress)
remaining = timeout_ms;
console_unlock();
@@ -3770,9 +4010,17 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* The BKL thread waits on @log_wait. */
+ pending |= PRINTK_PENDING_WAKEUP;
+ } else {
+ /*
+ * If trylock fails, some other context
+ * will do the printing.
+ */
+ if (console_trylock())
+ console_unlock();
+ }
}
if (pending & PRINTK_PENDING_WAKEUP)
@@ -3807,33 +4055,58 @@ static void __wake_up_klogd(int val)
preempt_enable();
}
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing for those records is handled elsewhere. In
+ * this case only the logging daemon needs to be woken.
+ *
+ * Context: Any context.
+ */
void wake_up_klogd(void)
{
__wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * but the current context is unable to perform the console printing.
+ * This function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
void defer_console_output(void)
{
+ int val = PRINTK_PENDING_WAKEUP;
+
/*
* New messages may have been added directly to the ringbuffer
* using vprintk_store(), so wake any waiters as well.
*/
- __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+ if (have_bkl_console)
+ val |= PRINTK_PENDING_OUTPUT;
+ __wake_up_klogd(val);
}
void printk_trigger_flush(void)
{
+ struct cons_write_context wctxt = { };
+
+ preempt_disable();
+ cons_atomic_flush(&wctxt, true);
+ preempt_enable();
+
+ cons_wake_threads();
defer_console_output();
}
int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
int _printk_deferred(const char *fmt, ...)
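Annotation: a sketch of how a NOBKL console driver is expected to use the ownership API that printk_nobkl.c introduces below (console_try_acquire(), console_can_proceed(), console_release()); the driver function itself is a hypothetical stand-in.

	static bool nobkl_write_sketch(struct cons_write_context *wctxt)
	{
		if (!console_try_acquire(wctxt))
			return false;

		/* ... program the hardware / emit the record ... */

		/*
		 * Re-check ownership after any unsafe section; if this fails
		 * the console was taken over and is no longer held.
		 */
		if (!console_can_proceed(wctxt))
			return false;

		return console_release(wctxt);
	}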
diff --git a/kernel/printk/printk_nobkl.c b/kernel/printk/printk_nobkl.c
new file mode 100644
index 0000000000000..e0b818a4f8b38
--- /dev/null
+++ b/kernel/printk/printk_nobkl.c
@@ -0,0 +1,1825 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include "printk_ringbuffer.h"
+#include "internal.h"
+/*
+ * Printk implementation for consoles that do not depend on the BKL style
+ * console_lock mechanism.
+ *
+ * Console is locked on a CPU when state::locked is set and state::cpu ==
+ * current CPU. This is valid for the current execution context.
+ *
+ * Nesting execution contexts on the same CPU can carefully take over
+ * if the driver allows reentrancy via state::unsafe = false. When the
+ * interrupted context resumes it checks the state before entering
+ * an unsafe region and aborts the operation if it detects a takeover.
+ *
+ * In case of panic or emergency the nesting context can take over the
+ * console forcefully. The write callback is then invoked with the unsafe
+ * flag set in the write context data, which allows the driver side to avoid
+ * locks and to evaluate the driver state so it can use an emergency path
+ * or repair the state instead of blindly assuming that it works.
+ *
+ * If the interrupted context touches the assigned record buffer after
+ * takeover, it does not cause harm because at the same execution level
+ * there is no concurrency on the same CPU. A threaded printer always has
+ * its own record buffer so it can never interfere with any of the per CPU
+ * record buffers.
+ *
+ * A concurrent writer on a different CPU can request to take over the
+ * console by:
+ *
+ * 1) Carefully writing the desired state into state[REQ]
+ * if there is no same or higher priority request pending.
+ * This locks state[REQ] except for higher priority
+ * waiters.
+ *
+ * 2) Setting state[CUR].req_prio unless a same or higher
+ * priority waiter won the race.
+ *
+ * 3) Carefully spin on state[CUR] until that is locked with the
+ * expected state. When the state is not the expected one then it
+ * has to verify that state[REQ] is still the same and that
+ * state[CUR] has not been taken over or unlocked.
+ *
+ * The unlocker hands over to state[REQ], but only if state[CUR]
+ * matches.
+ *
+ * If the owner does not react to the request and does not make
+ * observable progress, the waiter will time out and can then decide to do
+ * a hostile takeover.
+ */
+
+#define copy_full_state(_dst, _src) do { _dst = _src; } while (0)
+#define copy_bit_state(_dst, _src) do { _dst.bits = _src.bits; } while (0)
+
+#ifdef CONFIG_64BIT
+#define copy_seq_state64(_dst, _src) do { _dst.seq = _src.seq; } while (0)
+#else
+#define copy_seq_state64(_dst, _src) do { } while (0)
+#endif
+
+enum state_selector {
+ CON_STATE_CUR,
+ CON_STATE_REQ,
+};
+
+/**
+ * cons_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use cons_state_try_cmpxchg().
+ */
+static inline void cons_state_set(struct console *con, enum state_selector which,
+ struct cons_state *new)
+{
+ atomic_long_set(&ACCESS_PRIVATE(con, atomic_state[which]), new->atom);
+}
+
+/**
+ * cons_state_read - Helper function to read the console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @state: The state to store the result
+ */
+static inline void cons_state_read(struct console *con, enum state_selector which,
+ struct cons_state *state)
+{
+ state->atom = atomic_long_read(&ACCESS_PRIVATE(con, atomic_state[which]));
+}
+
+/**
+ * cons_state_try_cmpxchg() - Helper function for atomic_long_try_cmpxchg() on console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @old: Old/expected state
+ * @new: New state
+ *
+ * Returns: True on success, false on fail
+ */
+static inline bool cons_state_try_cmpxchg(struct console *con,
+ enum state_selector which,
+ struct cons_state *old,
+ struct cons_state *new)
+{
+ return atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_state[which]),
+ &old->atom, new->atom);
+}
+
+/**
+ * cons_state_full_match - Check whether the full state matches
+ * @cur: The state to check
+ * @prev: The previous state
+ *
+ * Returns: True if matching, false otherwise.
+ *
+ * Check the full state including state::seq on 64bit. For take over
+ * detection.
+ */
+static inline bool cons_state_full_match(struct cons_state cur,
+ struct cons_state prev)
+{
+ /*
+ * req_prio can be set by a concurrent writer for friendly
+ * handover. Ignore it in the comparison.
+ */
+ cur.req_prio = prev.req_prio;
+ return cur.atom == prev.atom;
+}
+
+/**
+ * cons_state_bits_match - Check for matching state bits
+ * @cur: The state to check
+ * @prev: The previous state
+ *
+ * Returns: True if state matches, false otherwise.
+ *
+ * Contrary to cons_state_full_match this checks only the bits and ignores
+ * a sequence change on 64bits. On 32bit the two functions are identical.
+ */
+static inline bool cons_state_bits_match(struct cons_state cur, struct cons_state prev)
+{
+ /*
+ * req_prio can be set by a concurrent writer for friendly
+ * handover. Ignore it in the comparison.
+ */
+ cur.req_prio = prev.req_prio;
+ return cur.bits == prev.bits;
+}
+
+/**
+ * cons_check_panic - Check whether a remote CPU is in panic
+ *
+ * Returns: True if a remote CPU is in panic, false otherwise.
+ */
+static inline bool cons_check_panic(void)
+{
+ unsigned int pcpu = atomic_read(&panic_cpu);
+
+ return pcpu != PANIC_CPU_INVALID && pcpu != smp_processor_id();
+}
+
+static struct cons_context_data early_cons_ctxt_data __initdata;
+
+/**
+ * cons_context_set_pbufs - Set the output text buffer for the current context
+ * @ctxt: Pointer to the acquire context
+ *
+ * Buffer selection:
+ * 1) Early boot uses the global (initdata) buffer
+ * 2) Printer threads use the dynamically allocated per-console buffers
+ * 3) All other contexts use the per CPU buffers
+ *
+ * This guarantees that there is no concurrency on the output records ever.
+ * Early boot and per CPU nesting is not a problem. The takeover logic
+ * tells the interrupted context that the buffer has been overwritten.
+ *
+ * There are two critical regions that matter:
+ *
+ * 1) Context is filling the buffer with a record. After interruption
+ * it continues to sprintf() the record and before it goes to
+ * write it out, it checks the state, notices the takeover, discards
+ * the content and backs out.
+ *
+ * 2) Context is in an unsafe critical region in the driver. After
+ * interruption it might read overwritten data from the output
+ * buffer. When it leaves the critical region it notices and backs
+ * out. Hostile takeovers in driver critical regions are best effort
+ * and there is not much that can be done about that.
+ */
+static __ref void cons_context_set_pbufs(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+
+ /* Thread context or early boot? */
+ if (ctxt->thread)
+ ctxt->pbufs = con->thread_pbufs;
+ else if (!con->pcpu_data)
+ ctxt->pbufs = &early_cons_ctxt_data.pbufs;
+ else
+ ctxt->pbufs = &(this_cpu_ptr(con->pcpu_data)->pbufs);
+}
+
+/**
+ * cons_seq_init - Helper function to initialize the console sequence
+ * @con: Console to work on
+ *
+ * Set @con->atomic_seq to the starting record, or if that record no
+ * longer exists, the oldest available record. For init only. Do not
+ * use for runtime updates.
+ */
+static void cons_seq_init(struct console *con)
+{
+ u32 seq = (u32)max_t(u64, con->seq, prb_first_valid_seq(prb));
+#ifdef CONFIG_64BIT
+ struct cons_state state;
+
+ cons_state_read(con, CON_STATE_CUR, &state);
+ state.seq = seq;
+ cons_state_set(con, CON_STATE_CUR, &state);
+#else
+ atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq);
+#endif
+}
+
+/**
+ * cons_force_seq - Force a specified sequence number for a console
+ * @con: Console to work on
+ * @seq: Sequence number to force
+ *
+ * This function is only intended to be used in emergency situations. In
+ * particular: console_flush_on_panic(CONSOLE_REPLAY_ALL)
+ */
+void cons_force_seq(struct console *con, u64 seq)
+{
+#ifdef CONFIG_64BIT
+ struct cons_state old;
+ struct cons_state new;
+
+ do {
+ cons_state_read(con, CON_STATE_CUR, &old);
+ copy_bit_state(new, old);
+ new.seq = seq;
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new));
+#else
+ atomic_set(&ACCESS_PRIVATE(con, atomic_seq), seq);
+#endif
+}
+
+static inline u64 cons_expand_seq(u64 seq)
+{
+ u64 rbseq;
+
+ /*
+ * The provided sequence is only the lower 32bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the next sequence
+ * number from the ringbuffer and fold it.
+ */
+ rbseq = prb_next_seq(prb);
+ seq = rbseq - ((u32)rbseq - (u32)seq);
+
+ return seq;
+}
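Annotation, with illustrative numbers: if prb_next_seq() returns rbseq == 0x100000005 and the stored 32-bit value is seq == 0xfffffff0, then (u32)rbseq - (u32)seq == 0x15, and the expansion yields 0x100000005 - 0x15 == 0xfffffff0 as the full 64-bit sequence, i.e. 21 records behind the ringbuffer head, as expected.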
+
+/**
+ * cons_read_seq - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Returns: Sequence number of the next record to print on @con.
+ */
+u64 cons_read_seq(struct console *con)
+{
+ u64 seq;
+#ifdef CONFIG_64BIT
+ struct cons_state state;
+
+ cons_state_read(con, CON_STATE_CUR, &state);
+ seq = state.seq;
+#else
+ seq = atomic_read(&ACCESS_PRIVATE(con, atomic_seq));
+#endif
+ return cons_expand_seq(seq);
+}
+
+/**
+ * cons_context_set_seq - Setup the context with the next sequence to print
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ *
+ * On return the retrieved sequence number is stored in ctxt->oldseq.
+ *
+ * The sequence number is safe in forceful takeover situations.
+ *
+ * Either the writer succeeded to update before it got interrupted
+ * or it failed. In the latter case the takeover will print the
+ * same line again.
+ *
+ * The sequence is only the lower 32bits of the ringbuffer sequence. The
+ * ringbuffer must be 2^31 records ahead to get out of sync. This needs
+ * some care when starting a console, i.e. setting the sequence to 0 is
+ * wrong. It has to be set to the oldest valid sequence in the ringbuffer,
+ * as that cannot be more than 2^31 records away.
+ *
+ * On 64bit the 32bit sequence is part of console::state, which is saved
+ * in @ctxt->state. This prevents the 32bit update race.
+ */
+static void cons_context_set_seq(struct cons_context *ctxt)
+{
+#ifdef CONFIG_64BIT
+ ctxt->oldseq = ctxt->state.seq;
+#else
+ ctxt->oldseq = atomic_read(&ACCESS_PRIVATE(ctxt->console, atomic_seq));
+#endif
+ ctxt->oldseq = cons_expand_seq(ctxt->oldseq);
+ ctxt->newseq = ctxt->oldseq;
+}
+
+/**
+ * cons_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ *
+ * Returns: True if the console sequence was updated, false otherwise.
+ *
+ * Internal helper as the logic is different on 32bit and 64bit.
+ *
+ * On 32 bit the sequence is separate from state and therefore
+ * subject to a subtle race in the case of hostile takeovers.
+ *
+ * On 64 bit the sequence is part of the state and therefore safe
+ * vs. hostile takeovers.
+ *
+ * In case of fail the console has been taken over and @ctxt is
+ * invalid. Caller has to reacquire the console.
+ */
+#ifdef CONFIG_64BIT
+static bool cons_seq_try_update(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+ struct cons_state old;
+ struct cons_state new;
+
+ cons_state_read(con, CON_STATE_CUR, &old);
+ do {
+ /* Make sure this context is still the owner. */
+ if (!cons_state_bits_match(old, ctxt->state))
+ return false;
+
+ /* Preserve bit state */
+ copy_bit_state(new, old);
+ new.seq = ctxt->newseq;
+
+ /*
+ * Can race with hostile takeover or with a handover
+ * request.
+ */
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new));
+
+ copy_full_state(ctxt->state, new);
+ ctxt->oldseq = ctxt->newseq;
+
+ return true;
+}
+#else
+static bool cons_release(struct cons_context *ctxt);
+static bool cons_seq_try_update(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+ struct cons_state state;
+ int pcpu;
+ u32 old;
+ u32 new;
+
+ /*
+ * There is a corner case that needs to be considered here:
+ *
+ * CPU0 CPU1
+ * printk()
+ * acquire() -> emergency
+ * write() acquire()
+ * update_seq()
+ * state == OK
+ * --> NMI
+ * takeover()
+ * <--- write()
+ * cmpxchg() succeeds update_seq()
+ * cmpxchg() fails
+ *
+ * There is nothing that can be done about this other than having
+ * yet another state bit that needs to be tracked and analyzed,
+ * but fails to cover the problem completely.
+ *
+ * No other scenarios expose such a problem. On same CPU takeovers
+ * the cmpxchg() always fails on the interrupted context after the
+ * interrupting context finished printing, but that's fine as it
+ * does not own the console anymore. The state check after the
+ * failed cmpxchg prevents that.
+ */
+ cons_state_read(con, CON_STATE_CUR, &state);
+ /* Make sure this context is still the owner. */
+ if (!cons_state_bits_match(state, ctxt->state))
+ return false;
+
+ /*
+ * Get the original sequence number that was retrieved
+ * from @con->atomic_seq. @con->atomic_seq should be still
+ * the same. 32bit truncates. See cons_context_set_seq().
+ */
+ old = (u32)ctxt->oldseq;
+ new = (u32)ctxt->newseq;
+ if (atomic_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_seq), &old, new)) {
+ ctxt->oldseq = ctxt->newseq;
+ return true;
+ }
+
+ /*
+ * Reread the state. If this context does not own the console anymore
+ * then it cannot touch the sequence again.
+ */
+ cons_state_read(con, CON_STATE_CUR, &state);
+ if (!cons_state_bits_match(state, ctxt->state))
+ return false;
+
+ pcpu = atomic_read(&panic_cpu);
+ if (pcpu == smp_processor_id()) {
+ /*
+ * This is the panic CPU. Emitting a warning here does not
+ * help at all. The callchain is clear and the priority is
+ * to get the messages out, accepting duplicated lines in the
+ * worst case. Weeding those out is a job for postprocessing.
+ */
+ atomic_set(&ACCESS_PRIVATE(con, atomic_seq), new);
+ ctxt->oldseq = ctxt->newseq;
+ return true;
+ }
+
+ /*
+ * Only emit a warning when this happens outside of a panic
+ * situation as on panic it's neither useful nor helping to let the
+ * panic CPU get the important stuff out.
+ */
+ WARN_ON_ONCE(pcpu == PANIC_CPU_INVALID);
+
+ cons_release(ctxt);
+ return false;
+}
+#endif
+
+/**
+ * cons_cleanup_handover - Cleanup a handover request
+ * @ctxt: Pointer to acquire context
+ *
+ * @ctxt->hov_state contains the state to clean up
+ */
+static void cons_cleanup_handover(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+ struct cons_state new;
+
+ /*
+ * No loop required. Either hov_state is still the same or
+ * not.
+ */
+ new.atom = 0;
+ cons_state_try_cmpxchg(con, CON_STATE_REQ, &ctxt->hov_state, &new);
+}
+
+/**
+ * cons_setup_handover - Setup a handover request
+ * @ctxt: Pointer to acquire context
+ *
+ * Returns: True if a handover request was setup, false otherwise.
+ *
+ * On success @ctxt->hov_state contains the requested handover state
+ *
+ * On failure this context is not allowed to request a handover from the
+ * current owner. Reasons would be priority too low or a remote CPU in panic.
+ * In both cases this context should give up trying to acquire the console.
+ */
+static bool cons_setup_handover(struct cons_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct cons_state old;
+ struct cons_state hstate = {
+ .locked = 1,
+ .cur_prio = ctxt->prio,
+ .cpu = cpu,
+ };
+
+ /*
+ * Try to store hstate in @con->atomic_state[REQ]. This might
+ * race with a higher priority waiter.
+ */
+ cons_state_read(con, CON_STATE_REQ, &old);
+ do {
+ if (cons_check_panic())
+ return false;
+
+ /* Same or higher priority waiter exists? */
+ if (old.cur_prio >= ctxt->prio)
+ return false;
+
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_REQ, &old, &hstate));
+
+ /* Save that state for comparison in spinwait */
+ copy_full_state(ctxt->hov_state, hstate);
+ return true;
+}
+
+/**
+ * cons_setup_request - Setup a handover request in state[CUR]
+ * @ctxt: Pointer to acquire context
+ * @old: The state that was used to make the decision to spin wait
+ *
+ * Returns: True if a handover request was setup in state[CUR], false
+ * otherwise.
+ *
+ * On success @ctxt->req_state contains the request state that was set in
+ * state[CUR]
+ *
+ * On failure this context encountered unexpected state values. This
+ * context should retry the full handover request setup process (the
+ * handover request setup by cons_setup_handover() is now invalidated
+ * and must be performed again).
+ */
+static bool cons_setup_request(struct cons_context *ctxt, struct cons_state old)
+{
+ struct console *con = ctxt->console;
+ struct cons_state cur;
+ struct cons_state new;
+
+ /* Now set the request in state[CUR] */
+ cons_state_read(con, CON_STATE_CUR, &cur);
+ do {
+ if (cons_check_panic())
+ goto cleanup;
+
+ /* Bit state changed vs. the decision to spinwait? */
+ if (!cons_state_bits_match(cur, old))
+ goto cleanup;
+
+ /*
+ * A higher or equal priority context already setup a
+ * request?
+ */
+ if (cur.req_prio >= ctxt->prio)
+ goto cleanup;
+
+ /* Setup a request for handover. */
+ copy_full_state(new, cur);
+ new.req_prio = ctxt->prio;
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new));
+
+ /* Save that state for comparison in spinwait */
+ copy_bit_state(ctxt->req_state, new);
+ return true;
+
+cleanup:
+ cons_cleanup_handover(ctxt);
+ return false;
+}
+
+/**
+ * cons_try_acquire_spin - Complete the spinwait attempt
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ *
+ * @ctxt->hov_state contains the handover state that was set in
+ * state[REQ]
+ * @ctxt->req_state contains the request state that was set in
+ * state[CUR]
+ *
+ * Returns: 0 if successfully locked. -EBUSY on timeout. -EAGAIN on
+ * unexpected state values.
+ *
+ * On success @ctxt->state contains the new state that was set in
+ * state[CUR]
+ *
+ * On -EBUSY failure this context timed out. This context should either
+ * give up or attempt a hostile takeover.
+ *
+ * On -EAGAIN failure this context encountered unexpected state values.
+ * This context should retry the full handover request setup process (the
+ * handover request setup by cons_setup_handover() is now invalidated and
+ * must be performed again).
+ */
+static int cons_try_acquire_spin(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+ struct cons_state cur;
+ struct cons_state new;
+ int err = -EAGAIN;
+ int timeout;
+
+ /* Now wait for the other side to hand over */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* Timeout immediately if a remote panic is detected. */
+ if (cons_check_panic())
+ break;
+
+ cons_state_read(con, CON_STATE_CUR, &cur);
+
+ /*
+ * If the real state of the console matches the handover state
+ * that this context setup, then the handover was a success
+ * and this context is now the owner.
+ *
+ * Note that this might have raced with a new higher priority
+ * requester coming in after the lock was handed over.
+ * However, that requester will see that the owner changes and
+ * setup a new request for the current owner (this context).
+ */
+ if (cons_state_bits_match(cur, ctxt->hov_state))
+ goto success;
+
+ /*
+ * If state changed since the request was made, give up as
+ * it is no longer consistent. This must include
+ * state::req_prio since there could be a higher priority
+ * request available.
+ */
+ if (cur.bits != ctxt->req_state.bits)
+ goto cleanup;
+
+ /*
+ * Finally check whether the handover state is still
+ * the same.
+ */
+ cons_state_read(con, CON_STATE_REQ, &cur);
+ if (cur.atom != ctxt->hov_state.atom)
+ goto cleanup;
+
+ /* Account time */
+ if (timeout > 0)
+ udelay(1);
+ }
+
+ /*
+ * Timeout. Cleanup the handover state and carefully try to reset
+ * req_prio in the real state. The reset is important to ensure
+ * that the owner does not hand over the lock after this context
+ * has given up waiting.
+ */
+ cons_cleanup_handover(ctxt);
+
+ cons_state_read(con, CON_STATE_CUR, &cur);
+ do {
+ /*
+ * The timeout might have raced with the owner coming late
+ * and handing it over gracefully.
+ */
+ if (cons_state_bits_match(cur, ctxt->hov_state))
+ goto success;
+
+ /*
+ * Validate that the state matches with the state at request
+ * time. If this check fails, there is already a higher
+ * priority context waiting or the owner has changed (either
+ * by higher priority or by hostile takeover). In all fail
+ * cases this context is no longer in line for a handover to
+ * take place, so no reset is necessary.
+ */
+ if (cur.bits != ctxt->req_state.bits)
+ goto cleanup;
+
+ copy_full_state(new, cur);
+ new.req_prio = 0;
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &cur, &new));
+ /* Reset worked. Report timeout. */
+ return -EBUSY;
+
+success:
+ /* Store the real state */
+ copy_full_state(ctxt->state, cur);
+ ctxt->hostile = false;
+ err = 0;
+
+cleanup:
+ cons_cleanup_handover(ctxt);
+ return err;
+}
+
+/**
+ * __cons_try_acquire - Try to acquire the console for printk output
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ *
+ * Returns: True if the acquire was successful. False on fail.
+ *
+ * In case of success @ctxt->state contains the acquisition
+ * state.
+ *
+ * In case of fail @ctxt->old_state contains the state
+ * that was read from @con->state for analysis by the caller.
+ */
+static bool __cons_try_acquire(struct cons_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ short flags = console_srcu_read_flags(con);
+ struct cons_state old;
+ struct cons_state new;
+ int err;
+
+ if (WARN_ON_ONCE(!(flags & CON_NO_BKL)))
+ return false;
+again:
+ cons_state_read(con, CON_STATE_CUR, &old);
+
+ /* Preserve it for the caller and for spinwait */
+ copy_full_state(ctxt->old_state, old);
+
+ if (cons_check_panic())
+ return false;
+
+ /* Set up the new state for takeover */
+ copy_full_state(new, old);
+ new.locked = 1;
+ new.thread = ctxt->thread;
+ new.cur_prio = ctxt->prio;
+ new.req_prio = CONS_PRIO_NONE;
+ new.cpu = cpu;
+
+ /* Attempt to acquire it directly if unlocked */
+ if (!old.locked) {
+ if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+ goto again;
+
+ ctxt->hostile = false;
+ copy_full_state(ctxt->state, new);
+ goto success;
+ }
+
+ /*
+ * A threaded printer context will never spin or perform a
+ * hostile takeover. The atomic writer will wake the thread
+ * when it is done with the important output.
+ */
+ if (ctxt->thread)
+ return false;
+
+ /*
+ * If the active context is on the same CPU then there is
+ * obviously no handshake possible.
+ */
+ if (old.cpu == cpu)
+ goto check_hostile;
+
+ /*
+ * If a handover request with same or higher priority is already
+ * pending then this context cannot setup a handover request.
+ */
+ if (old.req_prio >= ctxt->prio)
+ goto check_hostile;
+
+ /*
+ * If the caller did not request spin-waiting then performing a
+ * handover is not an option.
+ */
+ if (!ctxt->spinwait)
+ goto check_hostile;
+
+ /*
+ * Setup the request in state[REQ]. If this fails then this
+ * context is not allowed to setup a handover request.
+ */
+ if (!cons_setup_handover(ctxt))
+ goto check_hostile;
+
+ /*
+ * Setup the request in state[CUR]. Hand in the state that was
+ * used to make the decision to spinwait above, for comparison. If
+ * this fails then unexpected state values were encountered and the
+ * full request setup process is retried.
+ */
+ if (!cons_setup_request(ctxt, old))
+ goto again;
+
+ /*
+ * Spin-wait to acquire the console. If this fails then unexpected
+ * state values were encountered (for example, a hostile takeover by
+ * another context) and the full request setup process is retried.
+ */
+ err = cons_try_acquire_spin(ctxt);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ goto check_hostile;
+ }
+success:
+ /* Common updates on success */
+ cons_context_set_seq(ctxt);
+ cons_context_set_pbufs(ctxt);
+ return true;
+
+check_hostile:
+ if (!ctxt->hostile)
+ return false;
+
+ if (cons_check_panic())
+ return false;
+
+ if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+ goto again;
+
+ copy_full_state(ctxt->state, new);
+ goto success;
+}
+
+/**
+ * cons_try_acquire - Try to acquire the console for printk output
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ *
+ * Returns: True if the acquire was successful. False on fail.
+ *
+ * In case of success @ctxt->state contains the acquisition
+ * state.
+ *
+ * In case of fail @ctxt->old_state contains the state
+ * that was read from @con->state for analysis by the caller.
+ */
+static bool cons_try_acquire(struct cons_context *ctxt)
+{
+ if (__cons_try_acquire(ctxt))
+ return true;
+
+ ctxt->state.atom = 0;
+ return false;
+}
+
+/**
+ * __cons_release - Release the console after output is done
+ * @ctxt: The acquire context that contains the state
+ * at cons_try_acquire()
+ *
+ * Returns: True if the release was regular
+ *
+ * False if the console is in unusable state or was handed over
+ * with handshake or taken over hostile without handshake.
+ *
+ * The return value tells the caller whether it needs to evaluate further
+ * printing.
+ */
+static bool __cons_release(struct cons_context *ctxt)
+{
+ struct console *con = ctxt->console;
+ short flags = console_srcu_read_flags(con);
+ struct cons_state hstate;
+ struct cons_state old;
+ struct cons_state new;
+
+ if (WARN_ON_ONCE(!(flags & CON_NO_BKL)))
+ return false;
+
+ cons_state_read(con, CON_STATE_CUR, &old);
+again:
+ if (!cons_state_bits_match(old, ctxt->state))
+ return false;
+
+ /* Release it directly when no handover request is pending. */
+ if (!old.req_prio)
+ goto unlock;
+
+ /* Read the handover target state */
+ cons_state_read(con, CON_STATE_REQ, &hstate);
+
+ /* If the waiter gave up hstate is 0 */
+ if (!hstate.atom)
+ goto unlock;
+
+ /*
+ * If a higher priority waiter raced against a lower priority
+ * waiter then unlock instead of handing over to either. The
+ * higher priority waiter will notice the updated state and
+ * retry.
+ */
+ if (hstate.cur_prio != old.req_prio)
+ goto unlock;
+
+ /* Switch the state and preserve the sequence on 64bit */
+ copy_bit_state(new, hstate);
+ copy_seq_state64(new, old);
+ if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+ goto again;
+
+ return true;
+
+unlock:
+ /* Clear the state and preserve the sequence on 64bit */
+ new.atom = 0;
+ copy_seq_state64(new, old);
+ if (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &old, &new))
+ goto again;
+
+ return true;
+}
+
+bool printk_threads_enabled __ro_after_init;
+static bool printk_force_atomic __initdata;
+
+/**
+ * cons_release - Release the console after output is done
+ * @ctxt: The acquire context that contains the state
+ * at cons_try_acquire()
+ *
+ * Returns: True if the release was regular
+ *
+ * False if the console is in an unusable state, was handed over
+ * with handshake, or was taken over without handshake (hostile takeover).
+ *
+ * The return value tells the caller whether it needs to evaluate further
+ * printing.
+ */
+static bool cons_release(struct cons_context *ctxt)
+{
+ bool ret = __cons_release(ctxt);
+
+ /* Invalidate the buffer pointer. It is no longer valid. */
+ ctxt->pbufs = NULL;
+
+ ctxt->state.atom = 0;
+ return ret;
+}
+
+bool console_try_acquire(struct cons_write_context *wctxt)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return cons_try_acquire(ctxt);
+}
+EXPORT_SYMBOL_GPL(console_try_acquire);
+
+bool console_release(struct cons_write_context *wctxt)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ return cons_release(ctxt);
+}
+EXPORT_SYMBOL_GPL(console_release);
+
+/**
+ * cons_alloc_percpu_data - Allocate percpu data for a console
+ * @con: Console to allocate for
+ *
+ * Returns: True on success. False otherwise and the console cannot be used.
+ *
+ * If it is not yet possible to allocate per CPU data, success is returned.
+ * When per CPU data allocation becomes possible, set_percpu_data_ready() will call
+ * this function again for all registered consoles.
+ */
+bool cons_alloc_percpu_data(struct console *con)
+{
+ if (!printk_percpu_data_ready())
+ return true;
+
+ con->pcpu_data = alloc_percpu(typeof(*con->pcpu_data));
+ if (con->pcpu_data)
+ return true;
+
+ con_printk(KERN_WARNING, con, "failed to allocate percpu buffers\n");
+ return false;
+}
+
+/**
+ * cons_free_percpu_data - Free percpu data of a console on unregister
+ * @con: Console to clean up
+ */
+static void cons_free_percpu_data(struct console *con)
+{
+ if (!con->pcpu_data)
+ return;
+
+ free_percpu(con->pcpu_data);
+ con->pcpu_data = NULL;
+}
+
+/**
+ * console_can_proceed - Check whether printing can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Returns: True if the state is correct. False if a handover
+ * has been requested or if the console was taken
+ * over.
+ *
+ * Must be invoked after the record was dumped into the assigned record
+ * buffer and at appropriate safe places in the driver. For unsafe driver
+ * sections see console_enter_unsafe().
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_can_proceed(struct cons_write_context *wctxt)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct cons_state state;
+
+ cons_state_read(con, CON_STATE_CUR, &state);
+ /* Store it for analysis or reuse */
+ copy_full_state(ctxt->old_state, state);
+
+ /* Make sure this context is still the owner. */
+ if (!cons_state_full_match(state, ctxt->state))
+ return false;
+
+ /*
+	 * Having a safe point for takeover, and possibly a few
+	 * duplicated characters or a full line, is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Continue if the requested priority is not sufficient.
+ */
+ if (state.req_prio <= state.cur_prio)
+ return true;
+
+ /*
+ * A console printer within an unsafe region is allowed to continue.
+	 * It can perform the handover when exiting the unsafe region. Otherwise
+ * a hostile takeover will be necessary.
+ */
+ if (state.unsafe)
+ return true;
+
+ /* Release and hand over */
+ cons_release(ctxt);
+ /*
+ * This does not check whether the handover succeeded. The
+ * outermost callsite has to make the final decision whether printing
+ * should continue or not (via reacquire, possibly hostile). The
+ * console is unlocked already so go back all the way instead of
+ * trying to implement heuristics in tons of places.
+ */
+ return false;
+}
+EXPORT_SYMBOL_GPL(console_can_proceed);
+
+/**
+ * __console_update_unsafe - Update the unsafe bit in @con->atomic_state
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Returns: True if the state is correct. False if a handover
+ * has been requested or if the console was taken
+ * over.
+ *
+ * Must be invoked before an unsafe driver section is entered.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ *
+ * Internal helper to avoid duplicated code
+ */
+static bool __console_update_unsafe(struct cons_write_context *wctxt, bool unsafe)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct cons_state new;
+
+ do {
+ if (!console_can_proceed(wctxt))
+ return false;
+ /*
+ * console_can_proceed() saved the real state in
+ * ctxt->old_state
+ */
+ copy_full_state(new, ctxt->old_state);
+ new.unsafe = unsafe;
+
+ } while (!cons_state_try_cmpxchg(con, CON_STATE_CUR, &ctxt->old_state, &new));
+
+ copy_full_state(ctxt->state, new);
+ return true;
+}
+
+/**
+ * console_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Returns: True if the state is correct. False if a handover
+ * has been requested or if the console was taken
+ * over.
+ *
+ * Must be invoked before an unsafe driver section is entered.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_enter_unsafe(struct cons_write_context *wctxt)
+{
+ return __console_update_unsafe(wctxt, true);
+}
+EXPORT_SYMBOL_GPL(console_enter_unsafe);
+
+/**
+ * console_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Returns: True if the state is correct. False if a handover
+ * has been requested or if the console was taken
+ * over.
+ *
+ * Must be invoked when an unsafe driver section is exited.
+ *
+ * When this function returns false then the calling context is not allowed
+ * to go forward and has to back out immediately and carefully. The buffer
+ * content is no longer trusted either and the console lock is no longer
+ * held.
+ */
+bool console_exit_unsafe(struct cons_write_context *wctxt)
+{
+ return __console_update_unsafe(wctxt, false);
+}
+EXPORT_SYMBOL_GPL(console_exit_unsafe);
+
+/**
+ * cons_get_record - Fill the buffer with the next pending ringbuffer record
+ * @wctxt: The write context which will be handed to the write function
+ *
+ * Returns: True if there are records available. If the next record should
+ * be printed, the output buffer is filled and @wctxt->outbuf
+ * points to the text to print. If @wctxt->outbuf is NULL after
+ * the call, the record should not be printed but the caller must
+ * still update the console sequence number.
+ *
+ * False means that there are no pending records anymore and the
+ * printing can stop.
+ */
+static bool cons_get_record(struct cons_write_context *wctxt)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+
+ if (!printk_get_next_message(&pmsg, ctxt->newseq, is_extended, true))
+ return false;
+
+ ctxt->newseq = pmsg.seq;
+ ctxt->dropped += pmsg.dropped;
+
+ if (pmsg.outbuf_len == 0) {
+ wctxt->outbuf = NULL;
+ } else {
+ if (ctxt->dropped && !is_extended)
+ console_prepend_dropped(&pmsg, ctxt->dropped);
+ wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+ }
+
+ wctxt->len = pmsg.outbuf_len;
+
+ return true;
+}
+
+/**
+ * cons_emit_record - Emit record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ *
+ * Returns: False if the operation was aborted (takeover or handover).
+ * True otherwise
+ *
+ * When false is returned, the caller is not allowed to touch console state.
+ * The console is owned by someone else. If the caller wants to print more
+ * it has to reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer.
+ */
+static bool cons_emit_record(struct cons_write_context *wctxt)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool done = false;
+
+ /*
+ * @con->dropped is not protected in case of hostile takeovers so
+ * the update below is racy. Annotate it accordingly.
+ */
+ ctxt->dropped = data_race(READ_ONCE(con->dropped));
+
+ /* Fill the output buffer with the next record */
+ ctxt->backlog = cons_get_record(wctxt);
+ if (!ctxt->backlog)
+ return true;
+
+ /* Safety point. Don't touch state in case of takeover */
+ if (!console_can_proceed(wctxt))
+ return false;
+
+ /* Counterpart to the read above */
+ WRITE_ONCE(con->dropped, ctxt->dropped);
+
+ /*
+	 * In case of skipped records, update the sequence state in @con.
+ */
+ if (!wctxt->outbuf)
+ goto update;
+
+ /* Tell the driver about potential unsafe state */
+ wctxt->unsafe = ctxt->state.unsafe;
+
+ if (!ctxt->thread && con->write_atomic) {
+ done = con->write_atomic(con, wctxt);
+ } else if (ctxt->thread && con->write_thread) {
+ done = con->write_thread(con, wctxt);
+ } else {
+ cons_release(ctxt);
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* If not done, the write was aborted due to takeover */
+ if (!done)
+ return false;
+
+ /* If there was a dropped message, it has now been output. */
+ if (ctxt->dropped) {
+ ctxt->dropped = 0;
+ /* Counterpart to the read above */
+ WRITE_ONCE(con->dropped, ctxt->dropped);
+ }
+update:
+ ctxt->newseq++;
+ /*
+ * The sequence update attempt is not part of console_release()
+ * because in panic situations the console is not released by
+ * the panic CPU until all records are written. On 32bit the
+ * sequence is separate from state anyway.
+ */
+ return cons_seq_try_update(ctxt);
+}
+
+/**
+ * cons_kthread_should_wakeup - Check whether the printk thread should wakeup
+ * @con: Console to operate on
+ * @ctxt: The acquire context that contains the state
+ * at console_acquire()
+ *
+ * Returns: True if the thread should shut down or if the console is allowed to
+ * print and a record is available. False otherwise
+ *
+ * After the thread wakes up, it must first check if it should shut down before
+ * attempting any printing.
+ */
+static bool cons_kthread_should_wakeup(struct console *con, struct cons_context *ctxt)
+{
+ bool is_usable;
+ short flags;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ cookie = console_srcu_read_lock();
+ flags = console_srcu_read_flags(con);
+ is_usable = console_is_usable(con, flags);
+ console_srcu_read_unlock(cookie);
+
+ if (!is_usable)
+ return false;
+
+ /* This reads state and sequence on 64bit. On 32bit only state */
+ cons_state_read(con, CON_STATE_CUR, &ctxt->state);
+
+ /*
+ * Atomic printing is running on some other CPU. The owner
+ * will wake the console thread on unlock if necessary.
+ */
+ if (ctxt->state.locked)
+ return false;
+
+ /* Bring the sequence in @ctxt up to date */
+ cons_context_set_seq(ctxt);
+
+ return prb_read_valid(prb, ctxt->oldseq, NULL);
+}
+
+/**
+ * cons_kthread_func - The printk thread function
+ * @__console: Console to operate on
+ */
+static int cons_kthread_func(void *__console)
+{
+ struct console *con = __console;
+ struct cons_write_context wctxt = {
+ .ctxt.console = con,
+ .ctxt.prio = CONS_PRIO_NORMAL,
+ .ctxt.thread = 1,
+ };
+ struct cons_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ unsigned long flags;
+ short con_flags;
+ bool backlog;
+ int cookie;
+ int ret;
+
+ for (;;) {
+ atomic_inc(&con->kthread_waiting);
+
+ /*
+ * Provides a full memory barrier vs. cons_kthread_wake().
+ */
+ ret = rcuwait_wait_event(&con->rcuwait,
+ cons_kthread_should_wakeup(con, ctxt),
+ TASK_INTERRUPTIBLE);
+
+ atomic_dec(&con->kthread_waiting);
+
+ if (kthread_should_stop())
+ break;
+
+ /* Wait was interrupted by a spurious signal, go back to sleep */
+ if (ret)
+ continue;
+
+ for (;;) {
+ cookie = console_srcu_read_lock();
+
+ /*
+			 * Ensure this thread stays on the CPU to make handover and
+ * takeover possible.
+ */
+ if (con->port_lock)
+ con->port_lock(con, true, &flags);
+ else
+ migrate_disable();
+
+ /*
+ * Try to acquire the console without attempting to
+ * take over. If an atomic printer wants to hand
+ * back to the thread it simply wakes it up.
+ */
+ if (!cons_try_acquire(ctxt))
+ break;
+
+ con_flags = console_srcu_read_flags(con);
+
+ if (console_is_usable(con, con_flags)) {
+ /*
+ * If the emit fails, this context is no
+ * longer the owner. Abort the processing and
+ * wait for new records to print.
+ */
+ if (!cons_emit_record(&wctxt))
+ break;
+ backlog = ctxt->backlog;
+ } else {
+ backlog = false;
+ }
+
+ /*
+ * If the release fails, this context was not the
+ * owner. Abort the processing and wait for new
+ * records to print.
+ */
+ if (!cons_release(ctxt))
+ break;
+
+ /* Backlog done? */
+ if (!backlog)
+ break;
+
+ if (con->port_lock)
+ con->port_lock(con, false, &flags);
+ else
+ migrate_enable();
+
+ console_srcu_read_unlock(cookie);
+
+ cond_resched();
+ }
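+		/*
+		 * The break paths above leave the SRCU read lock and the
+		 * port lock (or the migration disable) held. Drop them
+		 * before going back to waiting for new records.
+		 */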
+ if (con->port_lock)
+ con->port_lock(con, false, &flags);
+ else
+ migrate_enable();
+
+ console_srcu_read_unlock(cookie);
+ }
+ return 0;
+}
+
+/**
+ * cons_irq_work - irq work to wake printk thread
+ * @irq_work: The irq work to operate on
+ */
+static void cons_irq_work(struct irq_work *irq_work)
+{
+ struct console *con = container_of(irq_work, struct console, irq_work);
+
+ cons_kthread_wake(con);
+}
+
+/**
+ * cons_wake_threads - Wake up printing threads
+ *
+ * A printing thread is only woken if it is within the @kthread_waiting
+ * block. If it is not within the block (or enters the block later), it
+ * will see any new records and continue printing on its own.
+ */
+void cons_wake_threads(void)
+{
+ struct console *con;
+ int cookie;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ if (con->kthread && atomic_read(&con->kthread_waiting))
+ irq_work_queue(&con->irq_work);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * struct cons_cpu_state - Per CPU printk context state
+ * @prio: The current context priority level
+ * @nesting: Per priority nest counter
+ */
+struct cons_cpu_state {
+ enum cons_prio prio;
+ int nesting[CONS_PRIO_MAX];
+};
+
+static DEFINE_PER_CPU(struct cons_cpu_state, cons_pcpu_state);
+static struct cons_cpu_state early_cons_pcpu_state __initdata;
+
+/**
+ * cons_get_cpu_state - Get the per CPU console state pointer
+ *
+ * Returns either a pointer to the per CPU state of the current CPU or to
+ * the init data state during early boot.
+ */
+static __ref struct cons_cpu_state *cons_get_cpu_state(void)
+{
+ if (!printk_percpu_data_ready())
+ return &early_cons_pcpu_state;
+
+ return this_cpu_ptr(&cons_pcpu_state);
+}
+
+/**
+ * cons_get_wctxt - Get the write context for atomic printing
+ * @con: Console to operate on
+ * @prio: Priority of the context
+ *
+ * Returns either the per CPU context or the builtin context for
+ * early boot.
+ */
+static __ref struct cons_write_context *cons_get_wctxt(struct console *con,
+ enum cons_prio prio)
+{
+ if (!con->pcpu_data)
+ return &early_cons_ctxt_data.wctxt[prio];
+
+ return &this_cpu_ptr(con->pcpu_data)->wctxt[prio];
+}
+
+/**
+ * cons_atomic_try_acquire - Try to acquire the console for atomic printing
+ * @con: The console to acquire
+ * @ctxt: The console context instance to work on
+ * @prio: The priority of the current context
+ */
+static bool cons_atomic_try_acquire(struct console *con, struct cons_context *ctxt,
+ enum cons_prio prio, bool skip_unsafe)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->spinwait_max_us = 2000;
+ ctxt->prio = prio;
+ ctxt->spinwait = 1;
+
+ /* Try to acquire it directly or via a friendly handover */
+ if (cons_try_acquire(ctxt))
+ return true;
+
+ /* Investigate whether a hostile takeover is due */
+ if (ctxt->old_state.cur_prio >= prio)
+ return false;
+
+ if (!ctxt->old_state.unsafe || !skip_unsafe)
+ ctxt->hostile = 1;
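+
+	/* Retry the acquire, with a hostile takeover permitted if flagged above. */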
+ return cons_try_acquire(ctxt);
+}
+
+/**
+ * cons_atomic_flush_con - Flush one console in atomic mode
+ * @wctxt: The write context struct to use for this context
+ * @con: The console to flush
+ * @prio: The priority of the current context
+ * @skip_unsafe: True, to avoid unsafe hostile takeovers
+ */
+static void cons_atomic_flush_con(struct cons_write_context *wctxt, struct console *con,
+ enum cons_prio prio, bool skip_unsafe)
+{
+ struct cons_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool wake_thread = false;
+ short flags;
+
+ if (!cons_atomic_try_acquire(con, ctxt, prio, skip_unsafe))
+ return;
+
+ do {
+ flags = console_srcu_read_flags(con);
+
+ if (!console_is_usable(con, flags))
+ break;
+
+ /*
+ * For normal prio messages let the printer thread handle
+ * the printing if it is available.
+ */
+ if (prio <= CONS_PRIO_NORMAL && con->kthread) {
+ wake_thread = true;
+ break;
+ }
+
+ /*
+ * cons_emit_record() returns false when the console was
+ * handed over or taken over. In both cases the context is
+ * no longer valid.
+ */
+ if (!cons_emit_record(wctxt))
+ return;
+ } while (ctxt->backlog);
+
+ cons_release(ctxt);
+
+ if (wake_thread && atomic_read(&con->kthread_waiting))
+ irq_work_queue(&con->irq_work);
+}
+
+/**
+ * cons_atomic_flush - Flush consoles in atomic mode if required
+ * @printk_caller_wctxt: The write context struct to use for this
+ * context (for printk() context only)
+ * @skip_unsafe: True, to avoid unsafe hostile takeovers
+ */
+void cons_atomic_flush(struct cons_write_context *printk_caller_wctxt, bool skip_unsafe)
+{
+ struct cons_write_context *wctxt;
+ struct cons_cpu_state *cpu_state;
+ struct console *con;
+ short flags;
+ int cookie;
+
+ cpu_state = cons_get_cpu_state();
+
+ /*
+ * When in an elevated priority, the printk() calls are not
+ * individually flushed. This is to allow the full output to
+	 * be dumped to the ringbuffer before printing of the backlog
+	 * starts.
+ */
+ if (cpu_state->prio > CONS_PRIO_NORMAL && printk_caller_wctxt)
+ return;
+
+ /*
+ * Let the outermost write of this priority print. This avoids
+ * nasty hackery for nested WARN() where the printing itself
+ * generates one.
+ *
+ * cpu_state->prio <= CONS_PRIO_NORMAL is not subject to nesting
+ * and can proceed in order to allow atomic printing when consoles
+ * do not have a printer thread.
+ */
+ if (cpu_state->prio > CONS_PRIO_NORMAL &&
+ cpu_state->nesting[cpu_state->prio] != 1)
+ return;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ if (!con->write_atomic)
+ continue;
+
+ flags = console_srcu_read_flags(con);
+
+ if (!console_is_usable(con, flags))
+ continue;
+
+ if (cpu_state->prio > CONS_PRIO_NORMAL || !con->kthread) {
+ if (printk_caller_wctxt)
+ wctxt = printk_caller_wctxt;
+ else
+ wctxt = cons_get_wctxt(con, cpu_state->prio);
+ cons_atomic_flush_con(wctxt, con, cpu_state->prio, skip_unsafe);
+ }
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * cons_atomic_enter - Enter a context that enforces atomic printing
+ * @prio: Priority of the context
+ *
+ * Returns: The previous priority that needs to be fed into
+ * the corresponding cons_atomic_exit()
+ */
+enum cons_prio cons_atomic_enter(enum cons_prio prio)
+{
+ struct cons_cpu_state *cpu_state;
+ enum cons_prio prev_prio;
+
+ migrate_disable();
+ cpu_state = cons_get_cpu_state();
+
+ prev_prio = cpu_state->prio;
+ if (prev_prio < prio)
+ cpu_state->prio = prio;
+
+ /*
+ * Increment the nesting on @cpu_state->prio so a WARN()
+ * nested into a panic printout does not attempt to
+ * scribble state.
+ */
+ cpu_state->nesting[cpu_state->prio]++;
+
+ return prev_prio;
+}
+
+/**
+ * cons_atomic_exit - Exit a context that enforces atomic printing
+ * @prio: Priority of the context to leave
+ * @prev_prio: Priority of the previous context for restore
+ *
+ * @prev_prio is the priority returned by the corresponding cons_atomic_enter().
+ */
+void cons_atomic_exit(enum cons_prio prio, enum cons_prio prev_prio)
+{
+ struct cons_cpu_state *cpu_state;
+
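+	/* Flush the backlog, avoiding takeovers of consoles in unsafe state. */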
+ cons_atomic_flush(NULL, true);
+
+ cpu_state = cons_get_cpu_state();
+
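+	/* In panic, flush again and allow takeovers even in unsafe state. */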
+ if (cpu_state->prio == CONS_PRIO_PANIC)
+ cons_atomic_flush(NULL, false);
+
+ /*
+ * Undo the nesting of cons_atomic_enter() at the CPU state
+ * priority.
+ */
+ cpu_state->nesting[cpu_state->prio]--;
+
+ /*
+ * Restore the previous priority, which was returned by
+ * cons_atomic_enter().
+ */
+ cpu_state->prio = prev_prio;
+
+ migrate_enable();
+}
+
+/**
+ * cons_kthread_stop - Stop a printk thread
+ * @con: Console to operate on
+ */
+static void cons_kthread_stop(struct console *con)
+{
+ lockdep_assert_console_list_lock_held();
+
+ if (!con->kthread)
+ return;
+
+ kthread_stop(con->kthread);
+ con->kthread = NULL;
+
+ kfree(con->thread_pbufs);
+ con->thread_pbufs = NULL;
+}
+
+/**
+ * cons_kthread_create - Create a printk thread
+ * @con: Console to operate on
+ *
+ * If it fails, let the console proceed. The atomic part might
+ * be usable and useful.
+ */
+void cons_kthread_create(struct console *con)
+{
+ struct task_struct *kt;
+ struct console *c;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (!(con->flags & CON_NO_BKL) || !con->write_thread)
+ return;
+
+ if (!printk_threads_enabled || con->kthread)
+ return;
+
+ /*
+ * Printer threads cannot be started as long as any boot console is
+ * registered because there is no way to synchronize the hardware
+ * registers between boot console code and regular console code.
+ */
+ for_each_console(c) {
+ if (c->flags & CON_BOOT)
+ return;
+ }
+ have_boot_console = false;
+
+ con->thread_pbufs = kmalloc(sizeof(*con->thread_pbufs), GFP_KERNEL);
+ if (!con->thread_pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing thread buffers\n");
+ return;
+ }
+
+ kt = kthread_run(cons_kthread_func, con, "pr/%s%d", con->name, con->index);
+ if (IS_ERR(kt)) {
+ con_printk(KERN_ERR, con, "failed to start printing thread\n");
+ kfree(con->thread_pbufs);
+ con->thread_pbufs = NULL;
+ return;
+ }
+
+ con->kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(con->kthread, -20);
+}
+
+static int __init printk_setup_threads(void)
+{
+ struct console *con;
+
+ if (printk_force_atomic)
+ return 0;
+
+ console_list_lock();
+ printk_threads_enabled = true;
+ for_each_console(con)
+ cons_kthread_create(con);
+ if (have_bkl_console)
+ console_bkl_kthread_create();
+ console_list_unlock();
+ return 0;
+}
+early_initcall(printk_setup_threads);
+
+/**
+ * cons_nobkl_init - Initialize the NOBKL console specific data
+ * @con: Console to initialize
+ *
+ * Returns: True on success. False otherwise and the console cannot be used.
+ */
+bool cons_nobkl_init(struct console *con)
+{
+ struct cons_state state = { };
+
+ if (!cons_alloc_percpu_data(con))
+ return false;
+
+ rcuwait_init(&con->rcuwait);
+ atomic_set(&con->kthread_waiting, 0);
+ init_irq_work(&con->irq_work, cons_irq_work);
+ cons_state_set(con, CON_STATE_CUR, &state);
+ cons_state_set(con, CON_STATE_REQ, &state);
+ cons_seq_init(con);
+ cons_kthread_create(con);
+ return true;
+}
+
+/**
+ * cons_nobkl_cleanup - Cleanup the NOBKL console specific data
+ * @con: Console to cleanup
+ */
+void cons_nobkl_cleanup(struct console *con)
+{
+ struct cons_state state = { };
+
+ cons_kthread_stop(con);
+ cons_state_set(con, CON_STATE_CUR, &state);
+ cons_state_set(con, CON_STATE_REQ, &state);
+ cons_free_percpu_data(con);
+}
+
+/**
+ * printk_kthread_shutdown - shutdown all threaded printers
+ *
+ * On system shutdown all threaded printers are stopped. This allows printk
+ * to transition back to atomic printing, thus providing a robust mechanism
+ * for the final shutdown/reboot messages to be output.
+ */
+static void printk_kthread_shutdown(void)
+{
+ struct console *con;
+
+ console_list_lock();
+ for_each_console(con) {
+ if (con->flags & CON_NO_BKL)
+ cons_kthread_stop(con);
+ }
+ console_list_unlock();
+}
+
+static struct syscore_ops printk_syscore_ops = {
+ .shutdown = printk_kthread_shutdown,
+};
+
+static int __init printk_init_ops(void)
+{
+ register_syscore_ops(&printk_syscore_ops);
+ return 0;
+}
+device_initcall(printk_init_ops);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da1..5c1470bd60bcb 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -12,18 +12,41 @@
#include "internal.h"
-static DEFINE_PER_CPU(int, printk_context);
+struct printk_context {
+ local_lock_t cpu;
+ int recursion;
+};
+
+static DEFINE_PER_CPU(struct printk_context, printk_context) = {
+ .cpu = INIT_LOCAL_LOCK(cpu),
+};
/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
+void __printk_safe_enter(unsigned long *flags)
{
- this_cpu_inc(printk_context);
+ WARN_ON_ONCE(in_nmi());
+ local_lock_irqsave(&printk_context.cpu, *flags);
+ this_cpu_inc(printk_context.recursion);
}
/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
+void __printk_safe_exit(unsigned long *flags)
{
- this_cpu_dec(printk_context);
+ WARN_ON_ONCE(in_nmi());
+ this_cpu_dec(printk_context.recursion);
+ local_unlock_irqrestore(&printk_context.cpu, *flags);
+}
+
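+/*
+ * The deferred variants are used from atomic context (checked below),
+ * where preemption is already disabled. Therefore the per CPU recursion
+ * counter can be modified without taking the local lock.
+ */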
+void __printk_deferred_enter(void)
+{
+ WARN_ON_ONCE(!in_atomic());
+ this_cpu_inc(printk_context.recursion);
+}
+
+void __printk_deferred_exit(void)
+{
+ WARN_ON_ONCE(!in_atomic());
+ this_cpu_dec(printk_context.recursion);
}
asmlinkage int vprintk(const char *fmt, va_list args)
@@ -38,13 +61,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if (this_cpu_read(printk_context) || in_nmi()) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- defer_console_output();
- return len;
- }
+ if (this_cpu_read(printk_context.recursion) || in_nmi())
+ return vprintk_deferred(fmt, args);
/* No obstacles. */
return vprintk_default(fmt, args);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8e6c023212cb3..34f8adf7c0a39 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2407,6 +2407,12 @@ static int rcutorture_booster_init(unsigned int cpu)
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_PREEMPT_RT
+ t = per_cpu(timersd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#endif
}
/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b10b8349bb2a4..804306204d0d0 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
*/
#include <linux/kvm_para.h>
+#include <linux/console.h>
//////////////////////////////////////////////////////////////////////////////
//
@@ -582,6 +583,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
+ enum cons_prio prev_prio;
int cpu;
unsigned long flags;
unsigned long gpa;
@@ -597,6 +599,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
if (rcu_stall_is_suppressed())
return;
+ prev_prio = cons_atomic_enter(CONS_PRIO_EMERGENCY);
+
/*
* OK, time to rat on our buddy...
* See Documentation/RCU/stallwarn.rst for info on how to debug
@@ -651,6 +655,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
panic_on_rcu_stall();
rcu_force_quiescent_state(); /* Kick them all. */
+
+ cons_atomic_exit(CONS_PRIO_EMERGENCY, prev_prio);
}
static void print_cpu_stall(unsigned long gps)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f904..a57a1a3beeba1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1042,6 +1042,46 @@ void resched_curr(struct rq *rq)
trace_sched_wake_idle_without_ipi(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+ return 0;
+#endif
+}
+
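+/*
+ * resched_curr_lazy - Set TIF_NEED_RESCHED_LAZY on the current task of @rq.
+ *
+ * Falls back to resched_curr() when the PREEMPT_LAZY scheduler feature is
+ * disabled. Unlike resched_curr() this does not force an immediate
+ * preemption: preemption points which honor the lazy flag (see
+ * preemptible_lazy()) may skip rescheduling while preempt_lazy_count
+ * is elevated.
+ */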
+void resched_curr_lazy(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_curr(rq);
+ return;
+ }
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ if (test_tsk_need_resched_lazy(curr))
+ return;
+
+ set_tsk_need_resched_lazy(curr);
+
+ cpu = cpu_of(rq);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(curr))
+ smp_send_reschedule(cpu);
+}
+#endif
+
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -2230,6 +2270,7 @@ void migrate_disable(void)
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
+ preempt_lazy_disable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2265,6 +2306,7 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
+ preempt_lazy_enable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -3318,6 +3360,76 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
}
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Consider:
+ *
+ * set_special_state(X);
+ *
+ * do_things()
+ * // Somewhere in there is an rtlock that can be contended:
+ * current_save_and_set_rtlock_wait_state();
+ * [...]
+ * schedule_rtlock(); (A)
+ * [...]
+ * current_restore_rtlock_saved_state();
+ *
+ * schedule(); (B)
+ *
+ * If p->saved_state is anything else than TASK_RUNNING, then p blocked on an
+ * rtlock (A) *before* voluntarily calling into schedule() (B) after setting its
+ * state to X. For things like ptrace (X=TASK_TRACED), the task could have more
+ * work to do upon acquiring the lock in do_things() before whoever called
+ * wait_task_inactive() should return. IOW, we have to wait for:
+ *
+ * p.saved_state = TASK_RUNNING
+ * p.__state = X
+ *
+ * which implies the task isn't blocked on an RT lock and got to schedule() (B).
+ *
+ * Also see comments in ttwu_state_match().
+ */
+
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+ unsigned long flags;
+ bool mismatch;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ if (READ_ONCE(p->__state) & match_state)
+ mismatch = false;
+ else if (READ_ONCE(p->saved_state) & match_state)
+ mismatch = false;
+ else
+ mismatch = true;
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return mismatch;
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+ bool *wait)
+{
+ if (READ_ONCE(p->__state) & match_state)
+ return true;
+ if (READ_ONCE(p->saved_state) & match_state) {
+ *wait = true;
+ return true;
+ }
+ return false;
+}
+#else
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+ return !(READ_ONCE(p->__state) & match_state);
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+ bool *wait)
+{
+ return (READ_ONCE(p->__state) & match_state);
+}
+#endif
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -3336,7 +3448,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
*/
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
- int running, queued;
+ bool running, wait;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
@@ -3362,7 +3474,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* is actually now running somewhere else!
*/
while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
+ if (state_mismatch(p, match_state))
return 0;
cpu_relax();
}
@@ -3375,9 +3487,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
+ wait = task_on_rq_queued(p);
ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
+
+ if (state_match(p, match_state, &wait))
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3407,7 +3520,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(queued)) {
+ if (unlikely(wait)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -4712,6 +4825,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -6588,6 +6704,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
@@ -6648,14 +6765,11 @@ void __noreturn do_task_dead(void)
cpu_relax();
}
-static inline void sched_submit_work(struct task_struct *tsk)
+void sched_submit_work(void)
{
- unsigned int task_flags;
+ struct task_struct *tsk = current;
+ unsigned int task_flags = tsk->flags;
- if (task_is_running(tsk))
- return;
-
- task_flags = tsk->flags;
/*
* If a worker goes to sleep, notify and ask workqueue whether it
* wants to wake up a task to maintain concurrency.
@@ -6681,8 +6795,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_flush_plug(tsk->plug, true);
}
-static void sched_update_worker(struct task_struct *tsk)
+void sched_resume_work(void)
{
+ struct task_struct *tsk = current;
+
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
@@ -6691,20 +6807,29 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-asmlinkage __visible void __sched schedule(void)
+static void schedule_loop(unsigned int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(SM_NONE);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
- sched_update_worker(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ if (!task_is_running(current))
+ sched_submit_work();
+ schedule_loop(SM_NONE);
+ sched_resume_work();
}
EXPORT_SYMBOL(schedule);
+void schedule_rtmutex(void)
+{
+ schedule_loop(SM_NONE);
+}
+
/*
* synchronize_rcu_tasks() makes sure that no task is stuck in preempted
* state (have scheduled out non-voluntarily) by making sure that all
@@ -6764,11 +6889,7 @@ void __sched schedule_preempt_disabled(void)
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
- do {
- preempt_disable();
- __schedule(SM_RTLOCK_WAIT);
- sched_preempt_enable_no_resched();
- } while (need_resched());
+ schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
@@ -6802,6 +6923,30 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
+ * set by an RT task. Otherwise we try to avoid being scheduled out as long as
+ * the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+ if (test_thread_flag(TIF_NEED_RESCHED))
+ return 1;
+ if (current_thread_info()->preempt_lazy_count)
+ return 0;
+ return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+ return 1;
+}
+
+#endif
+
#ifdef CONFIG_PREEMPTION
/*
* This is the entry point to schedule() from in-kernel preemption
@@ -6815,6 +6960,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
@@ -6862,6 +7009,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
+
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -9167,7 +9317,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed89be0aa6503..46ffbbfde97b0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4948,7 +4948,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -4972,7 +4972,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static void
@@ -5118,7 +5118,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
/*
@@ -5267,7 +5267,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static __always_inline
@@ -6142,7 +6142,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (task_current(rq, p))
- resched_curr(rq);
+ resched_curr_lazy(rq);
return;
}
hrtick_start(rq, delta);
@@ -7871,7 +7871,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -12036,7 +12036,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -12063,7 +12063,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (task_current(rq, p)) {
if (p->prio > oldprio)
- resched_curr(rq);
+ resched_curr_lazy(rq);
} else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd33..e13090e33f3c4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
#ifdef CONFIG_PREEMPT_RT
SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
#else
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e3..6f272ef973675 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2370,6 +2370,15 @@ extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+ resched_curr(rq);
+}
+#endif
+
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df2941..138d68cfc204d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -432,7 +432,18 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
- q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+
+ if (!sigqueue_flags) {
+ struct sighand_struct *sighand = t->sighand;
+
+ lockdep_assert_held(&sighand->siglock);
+ if (sighand->sigqueue_cache) {
+ q = sighand->sigqueue_cache;
+ sighand->sigqueue_cache = NULL;
+ }
+ }
+ if (!q)
+ q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
} else {
print_dropped_signal(sig);
}
@@ -447,14 +458,43 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
return q;
}
-static void __sigqueue_free(struct sigqueue *q)
+static bool sigqueue_cleanup_accounting(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
- return;
+ return false;
if (q->ucounts) {
dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
q->ucounts = NULL;
}
+ return true;
+}
+
+static void __sigqueue_free(struct sigqueue *q)
+{
+ if (!sigqueue_cleanup_accounting(q))
+ return;
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
+void sigqueue_free_cached_entry(struct sigqueue *q)
+{
+ if (!q)
+ return;
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
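+/*
+ * Cache one sigqueue entry per sighand for reuse by the next allocation
+ * under the same siglock. Free the entry if the cache is already occupied.
+ */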
+static void sigqueue_cache_or_free(struct sigqueue *q)
+{
+ struct sighand_struct *sighand = current->sighand;
+
+ if (!sigqueue_cleanup_accounting(q))
+ return;
+
+ lockdep_assert_held(&sighand->siglock);
+ if (!sighand->sigqueue_cache) {
+ sighand->sigqueue_cache = q;
+ return;
+ }
kmem_cache_free(sigqueue_cachep, q);
}
@@ -594,7 +634,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
(info->si_code == SI_TIMER) &&
(info->si_sys_private);
- __sigqueue_free(first);
+ sigqueue_cache_or_free(first);
} else {
/*
* Ok, it wasn't in the queue. This must be
@@ -2296,15 +2336,31 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
do_notify_parent_cldstop(current, false, why);
/*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
+	 * The previous do_notify_parent_cldstop() invocation woke the ptracer.
+	 * On a PREEMPTION kernel this can result in a preemption requirement
+	 * which will be fulfilled after read_unlock() and the ptracer will be
+	 * put on the CPU.
+	 * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+	 * this task to wait in schedule(). If this task gets preempted then it
+ * remains enqueued on the runqueue. The ptracer will observe this and
+ * then sleep for a delay of one HZ tick. In the meantime this task
+ * gets scheduled, enters schedule() and will wait for the ptracer.
*
- * XXX: implement read_unlock_no_resched().
+	 * This preemption point is not bad from a correctness point of view but
+	 * extends the runtime by one HZ tick due to the ptracer's sleep.
+	 * The preempt-disable section ensures that there will be no preemption
+	 * between unlock and schedule(), which improves performance since
+	 * the ptracer has no reason to sleep.
+ *
+ * This optimisation is not doable on PREEMPT_RT due to the spinlock_t
+ * within the preempt-disable section.
*/
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable_no_resched();
schedule();
cgroup_leave_frozen(true);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d9..af9e879bbbf75 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -80,21 +80,6 @@ static void wakeup_softirqd(void)
wake_up_process(tsk);
}
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
- */
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
-{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- if (pending & SOFTIRQ_NOW_MASK)
- return false;
- return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
-}
-
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_PER_CPU(int, hardirqs_enabled);
DEFINE_PER_CPU(int, hardirq_context);
@@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
goto out;
pending = local_softirq_pending();
- if (!pending || ksoftirqd_running(pending))
+ if (!pending)
goto out;
/*
@@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running(pending))
+ if (pending)
do_softirq_own_stack();
local_irq_restore(flags);
@@ -637,6 +619,24 @@ static inline void tick_irq_exit(void)
#endif
}
+#ifdef CONFIG_PREEMPT_RT
+DEFINE_PER_CPU(struct task_struct *, timersd);
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static void wake_timersd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(timersd);
+
+ if (tsk)
+ wake_up_process(tsk);
+}
+
+#else
+
+static inline void wake_timersd(void) { }
+
+#endif
+
static inline void __irq_exit_rcu(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -649,6 +649,10 @@ static inline void __irq_exit_rcu(void)
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() &&
+ !(in_nmi() | in_hardirq()))
+ wake_timersd();
+
tick_irq_exit();
}
@@ -976,12 +980,70 @@ static struct smp_hotplug_thread softirq_threads = {
.thread_comm = "ksoftirqd/%u",
};
+#ifdef CONFIG_PREEMPT_RT
+static void timersd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static int timersd_should_run(unsigned int cpu)
+{
+ return local_pending_timers();
+}
+
+static void run_timersd(unsigned int cpu)
+{
+ unsigned int timer_si;
+
+ ksoftirqd_run_begin();
+
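+	/*
+	 * Transfer the pending timer bits into the regular softirq
+	 * pending mask so that __do_softirq() processes them.
+	 */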
+ timer_si = local_pending_timers();
+ __this_cpu_write(pending_timer_softirq, 0);
+ or_softirq_pending(timer_si);
+
+ __do_softirq();
+
+ ksoftirqd_run_end();
+}
+
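+/*
+ * On PREEMPT_RT the timer related softirqs are not raised as regular
+ * softirqs. They are recorded in the per CPU @pending_timer_softirq mask
+ * and handled by the ktimers thread via run_timersd().
+ */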
+static void raise_ktimers_thread(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ __this_cpu_or(pending_timer_softirq, 1 << nr);
+}
+
+void raise_hrtimer_softirq(void)
+{
+ raise_ktimers_thread(HRTIMER_SOFTIRQ);
+}
+
+void raise_timer_softirq(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raise_ktimers_thread(TIMER_SOFTIRQ);
+ wake_timersd();
+ local_irq_restore(flags);
+}
+
+static struct smp_hotplug_thread timer_threads = {
+ .store = &timersd,
+ .setup = timersd_setup,
+ .thread_should_run = timersd_should_run,
+ .thread_fn = run_timersd,
+ .thread_comm = "ktimers/%u",
+};
+#endif
+
static __init int spawn_ksoftirqd(void)
{
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
+#ifdef CONFIG_PREEMPT_RT
+ BUG_ON(smpboot_register_percpu_thread(&timer_threads));
+#endif
return 0;
}
early_initcall(spawn_ksoftirqd);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e8c08292defcb..10c1246cdba76 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_hrtimer_softirq();
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1918,7 +1918,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raise_hrtimer_softirq();
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 808a247205a9a..c0a32aab8f276 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,29 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int start, id;
- do {
+ /* Can be written by a different task concurrently in the loop below */
+ start = READ_ONCE(sig->next_posix_timer_id);
+
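+	/*
+	 * The initial value ~start guarantees that the loop is entered.
+	 * In the absence of concurrent updates it terminates once the
+	 * candidate ID has wrapped around to @start again, i.e. after the
+	 * whole positive ID space has been scanned once.
+	 */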
+ for (id = ~start; start != id; id++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX);
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
@@ -1037,27 +1041,59 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
}
/*
- * return timer owned by the process, used by exit_itimers
+ * Delete a timer if it is armed, remove it from the hash and schedule it
+ * for RCU freeing.
*/
static void itimer_delete(struct k_itimer *timer)
{
-retry_delete:
- spin_lock_irq(&timer->it_lock);
+ unsigned long flags;
+retry_delete:
+ /*
+ * irqsave is required to make timer_wait_running() work.
+ */
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+ /*
+	 * Even if the timer is no longer accessible from other tasks
+ * it still might be armed and queued in the underlying timer
+ * mechanism. Worse, that timer mechanism might run the expiry
+ * function concurrently.
+ */
if (timer_delete_hook(timer) == TIMER_RETRY) {
- spin_unlock_irq(&timer->it_lock);
+ /*
+		 * The timer is being expired concurrently. Prevent
+		 * livelocks and pointless spinning on RT.
+		 *
+		 * The CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y case is
+		 * irrelevant here because obviously the exiting task
+		 * cannot be expiring a timer in task work concurrently.
+ * Ditto for CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n as the
+ * tick interrupt cannot run on this CPU because the above
+ * spin_lock disabled interrupts.
+ *
+ * timer_wait_running() drops timer::it_lock, which opens
+ * the possibility for another task to delete the timer.
+ *
+ * That's not possible here because this is invoked from
+ * do_exit() only for the last thread of the thread group.
+ * So no other task can access that timer.
+ */
+ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
+ return;
+
goto retry_delete;
}
list_del(&timer->list);
- spin_unlock_irq(&timer->it_lock);
+ spin_unlock_irqrestore(&timer->it_lock, flags);
release_posix_timer(timer, IT_ID_SET);
}
/*
- * This is called by do_exit or de_thread, only when nobody else can
- * modify the signal->posix_timers list. Yet we need sighand->siglock
- * to prevent the race with /proc/pid/timers.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
*/
void exit_itimers(struct task_struct *tsk)
{
@@ -1067,10 +1103,12 @@ void exit_itimers(struct task_struct *tsk)
if (list_empty(&tsk->signal->posix_timers))
return;
+ /* Protect against concurrent read via /proc/$PID/timers */
spin_lock_irq(&tsk->sighand->siglock);
list_replace_init(&tsk->signal->posix_timers, &timers);
spin_unlock_irq(&tsk->sighand->siglock);
+	/* The timers are no longer accessible via tsk::signal */
while (!list_empty(&timers)) {
tmr = list_first_entry(&timers, struct k_itimer, list);
itimer_delete(tmr);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a46506f7ec6d0..1ae9e4e8a0715 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -789,7 +789,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
static inline bool local_timer_softirq_pending(void)
{
- return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+ return local_pending_timers() & BIT(TIMER_SOFTIRQ);
}
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd4..7cad6fe3c035c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2054,7 +2054,7 @@ static void run_local_timers(void)
if (time_before(jiffies, base->next_expiry))
return;
}
- raise_softirq(TIMER_SOFTIRQ);
+ raise_timer_softirq();
}
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5909aaf2f4c08..2867def70f441 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2694,11 +2694,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
trace_flags |= TRACE_FLAG_BH_OFF;
- if (tif_need_resched())
+ if (tif_need_resched_now())
trace_flags |= TRACE_FLAG_NEED_RESCHED;
+#ifdef CONFIG_PREEMPT_LAZY
+ /* Run out of bits. Share the LAZY and PREEMPT_RESCHED */
+ if (need_resched_lazy())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
+#else
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
- return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+#endif
+
+ return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (preempt_lazy_count() & 0xff) << 16 |
(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
@@ -4287,15 +4295,17 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off/BH-disabled\n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / _-=> migrate-disable \n"
- "# ||||| / delay \n"
- "# cmd pid |||||| time | caller \n"
- "# \\ / |||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n"
+ "# / _-------=> irqs-off/BH-disabled\n"
+ "# | / _------=> need-resched \n"
+ "# || / _-----=> need-resched-lazy\n"
+ "# ||| / _----=> hardirq/softirq \n"
+ "# |||| / _---=> preempt-depth \n"
+ "# ||||| / _--=> preempt-lazy-depth\n"
+ "# |||||| / _-=> migrate-disable \n"
+ "# ||||||| / delay \n"
+ "# cmd pid |||||||| time | caller \n"
+ "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4329,14 +4339,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
- seq_printf(m, "# %.*s|||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-------=> irqs-off/BH-disabled\n", prec, space);
+ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space);
+ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space);
+ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | ");
}
void
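
The tracing_gen_ctx_irq_test() hunk above repacks the per-event context word: the TRACE_FLAG_* bits move up from bit 16 to bit 24, freeing bits 16-23 for preempt_lazy_count next to the existing preempt-count and migrate-disable nibbles. A small standalone decoder for the new layout, derived from the packing expression above (the function name and sample value are illustrative only):

#include <stdio.h>

/* Decode the context word built by tracing_gen_ctx_irq_test() after this
 * patch:
 *   [31:24] TRACE_FLAG_* bits      (previously [31:16])
 *   [23:16] preempt_lazy_count     (new)
 *   [7:4]   migrate-disable depth, clamped to 0xf
 *   [3:0]   preempt count, clamped to 0xf
 */
static void decode_trace_ctx(unsigned int ctx)
{
	printf("flags=0x%02x lazy=%u migrate=%u preempt=%u\n",
	       (ctx >> 24) & 0xffu,
	       (ctx >> 16) & 0xffu,
	       (ctx >> 4) & 0xfu,
	       ctx & 0xfu);
}

int main(void)
{
	decode_trace_ctx(0x01020031u);	/* flags=0x01 lazy=2 migrate=3 preempt=1 */
	return 0;
}

The flag byte itself did not grow, only its position did, which is why the hunk's own comment has CONFIG_PREEMPT_LAZY share the PREEMPT_RESCHED flag bit for the lazy case.
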
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 654ffa40457aa..b2d52f8355b70 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -208,6 +208,7 @@ static int trace_define_common_fields(void)
/* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
+ __common_field(unsigned char, preempt_lazy_count);
return ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bd475a00f96d1..89d4a3bfdc6d5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
char hardsoft_irq;
char need_resched;
+ char need_resched_lazy;
char irqs_off;
int hardirq;
int softirq;
@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
TRACE_FLAG_PREEMPT_RESCHED)) {
+#ifndef CONFIG_PREEMPT_LAZY
case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
+#endif
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
+#ifndef CONFIG_PREEMPT_LAZY
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
+#endif
default:
need_resched = '.';
break;
}
+ need_resched_lazy =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
hardsoft_irq =
(nmi && hardirq) ? 'Z' :
nmi ? 'z' :
@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.' ;
- trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq);
+ trace_seq_printf(s, "%c%c%c%c",
+ irqs_off, need_resched, need_resched_lazy,
+ hardsoft_irq);
if (entry->preempt_count & 0xf)
trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
else
trace_seq_putc(s, '.');
+ if (entry->preempt_lazy_count)
+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+ else
+ trace_seq_putc(s, '.');
+
if (entry->preempt_count & 0xf0)
trace_seq_printf(s, "%x", entry->preempt_count >> 4);
else
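
trace_print_lat_fmt() grows the latency prefix from three flag characters and two hex columns to four characters and three columns: the new 'L' slot reports a pending lazy reschedule and the new hex column reports preempt_lazy_count. A standalone rendition of the widened format, mirroring the printf logic above (the helper and its sample inputs are illustrative, and need_resched is simplified to a boolean here where the kernel distinguishes 'N'/'n'/'p'):

#include <stdio.h>

/* Print one latency prefix in the post-patch layout:
 * irqs-off, need-resched, need-resched-lazy, hardirq/softirq,
 * preempt depth, preempt-lazy depth, migrate-disable depth.
 */
static void print_lat_prefix(char irqs_off, int need_resched,
			     int need_resched_lazy, char hardsoft_irq,
			     unsigned int preempt_count,
			     unsigned int preempt_lazy_count)
{
	printf("%c%c%c%c", irqs_off,
	       need_resched ? 'n' : '.',
	       need_resched_lazy ? 'L' : '.',
	       hardsoft_irq);
	if (preempt_count & 0xf)		/* preempt depth, low nibble */
		printf("%x", preempt_count & 0xf);
	else
		putchar('.');
	if (preempt_lazy_count)			/* new lazy-depth column */
		printf("%x", preempt_lazy_count);
	else
		putchar('.');
	if (preempt_count & 0xf0)		/* migrate-disable, high nibble */
		printf("%x", preempt_count >> 4);
	else
		putchar('.');
	putchar('\n');
}

int main(void)
{
	print_lat_prefix('d', 1, 1, '.', 0x12, 1);	/* prints "dnL.211" */
	return 0;
}
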
diff --git a/localversion-rt b/localversion-rt
new file mode 100644
index 0000000000000..18777ec0c27d4
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
+-rt15
diff --git a/net/core/dev.c b/net/core/dev.c
index 1488f700bf819..8c3b70160be8c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4573,15 +4573,6 @@ static void rps_trigger_softirq(void *data)
#endif /* CONFIG_RPS */
-/* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data)
-{
- struct softnet_data *sd = data;
-
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- smp_store_release(&sd->defer_ipi_scheduled, 0);
-}
-
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
@@ -6632,6 +6623,30 @@ static void skb_defer_free_flush(struct softnet_data *sd)
}
}
+#ifndef CONFIG_PREEMPT_RT
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
+#else
+
+static void trigger_rx_softirq(struct work_struct *defer_work)
+{
+ struct softnet_data *sd;
+
+ sd = container_of(defer_work, struct softnet_data, defer_work);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+ local_bh_disable();
+ skb_defer_free_flush(sd);
+ local_bh_enable();
+}
+#endif
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -11409,7 +11424,11 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
+#ifndef CONFIG_PREEMPT_RT
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
+#else
+ INIT_WORK(&sd->defer_work, trigger_rx_softirq);
+#endif
spin_lock_init(&sd->defer_lock);
init_gro_hash(&sd->backlog);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 14bb41aafee30..3f8dac23205c6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6856,6 +6856,11 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+#ifndef CONFIG_PREEMPT_RT
smp_call_function_single_async(cpu, &sd->defer_csd);
+#else
+ schedule_work_on(cpu, &sd->defer_work);
+#endif
+ }
}
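
Taken together, the dev.c and skbuff.c hunks swap the mechanism used to kick a remote CPU into flushing its deferred skb free list: on !PREEMPT_RT an IPI runs trigger_rx_softirq() in hard-IRQ context and raises NET_RX_SOFTIRQ, while on PREEMPT_RT, where raising a softirq from hard-IRQ context is problematic, a per-CPU work item performs the flush directly in process context under local_bh_disable(). The softnet_data member this selects between, sketched from the INIT_CSD()/INIT_WORK() hunk (the real declaration is added by this patch in include/linux/netdevice.h, not shown here; placement and neighboring fields are elided):

struct softnet_data {
	/* ... */
#ifndef CONFIG_PREEMPT_RT
	call_single_data_t	defer_csd;	/* IPI descriptor: remote CPU
						 * raises NET_RX_SOFTIRQ from
						 * hard-IRQ context */
#else
	struct work_struct	defer_work;	/* kworker flushes the deferred
						 * skbs in process context */
#endif
	/* ... */
};

Both paths clear defer_ipi_scheduled with smp_store_release() once the kick has run, so the cmpxchg() guard in skb_attempt_defer_free() works identically in either configuration.
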