diff --git a/packages/kernel/linux/package.mk b/packages/kernel/linux/package.mk index e00829143..f48ce4a9f 100644 --- a/packages/kernel/linux/package.mk +++ b/packages/kernel/linux/package.mk @@ -4,7 +4,7 @@ PKG_NAME="linux" PKG_LICENSE="GPL" -PKG_VERSION="6.0.7" +PKG_VERSION="6.0.11" PKG_URL="https://www.kernel.org/pub/linux/kernel/v6.x/${PKG_NAME}-${PKG_VERSION}.tar.xz" PKG_SITE="http://www.kernel.org" PKG_DEPENDS_HOST="ccache:host rsync:host openssl:host" diff --git a/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch b/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch index c0c976eb9..2de168ec2 100644 --- a/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch +++ b/packages/kernel/linux/patches/X86_64/patch-6.0.5-rt14.patch @@ -1,7 +1,43 @@ -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 11ecf09aadc86..98aa5a478719c 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig +diff -rupN linux.orig/arch/arm/include/asm/thread_info.h linux/arch/arm/include/asm/thread_info.h +--- linux.orig/arch/arm/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 +@@ -62,6 +62,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ + struct cpu_context_save cpu_context; /* cpu context */ +@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(stru + #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ + #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ + #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(stru + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(stru + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff -rupN linux.orig/arch/arm/Kconfig linux/arch/arm/Kconfig +--- linux.orig/arch/arm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW @@ -35,48 +71,9 @@ index 11ecf09aadc86..98aa5a478719c 100644 select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK -diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index aecc403b28804..1b56e56f8f415 100644 ---- a/arch/arm/include/asm/thread_info.h -+++ b/arch/arm/include/asm/thread_info.h -@@ -62,6 +62,7 @@ struct cpu_context_save { - struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable, <0 => bug */ -+ int preempt_lazy_count; /* 0 => preemptable, <0 
=> bug */ - __u32 cpu; /* cpu */ - __u32 cpu_domain; /* cpu domain */ - struct cpu_context_save cpu_context; /* cpu context */ -@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ - #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ - #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ -+#define TIF_NEED_RESCHED_LAZY 9 - - #define TIF_USING_IWMMXT 17 - #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) - #define _TIF_SECCOMP (1 << TIF_SECCOMP) - #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) - - /* Checks for any syscall work in entry-common.S */ -@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - /* - * Change these and you break ASM code in entry-common.S - */ --#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ -+ _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NOTIFY_SIGNAL) - -diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index 2c8d76fd7c662..c3bdec7d2df9c 100644 ---- a/arch/arm/kernel/asm-offsets.c -+++ b/arch/arm/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm/kernel/asm-offsets.c linux/arch/arm/kernel/asm-offsets.c +--- linux.orig/arch/arm/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); @@ -85,11 +82,10 @@ index 2c8d76fd7c662..c3bdec7d2df9c 100644 DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); -diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index c39303e5c2347..cfb4660e9feab 100644 ---- a/arch/arm/kernel/entry-armv.S -+++ b/arch/arm/kernel/entry-armv.S -@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc) +diff -rupN linux.orig/arch/arm/kernel/entry-armv.S linux/arch/arm/kernel/entry-armv.S +--- linux.orig/arch/arm/kernel/entry-armv.S 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/entry-armv.S 2022-12-04 10:40:26.676034147 -0500 +@@ -222,11 +222,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -110,7 +106,7 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif svc_exit r5, irq = 1 @ return from exception -@@ -241,8 +248,14 @@ ENDPROC(__irq_svc) +@@ -241,8 +248,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED @@ -126,11 +122,10 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif __und_fault: -diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c -index ea128e32e8ca8..3671a4214d6f4 100644 ---- a/arch/arm/kernel/signal.c -+++ b/arch/arm/kernel/signal.c -@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) +diff -rupN linux.orig/arch/arm/kernel/signal.c linux/arch/arm/kernel/signal.c +--- linux.orig/arch/arm/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/signal.c 
2022-12-04 10:40:26.676034147 -0500 +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { @@ -140,11 +135,10 @@ index ea128e32e8ca8..3671a4214d6f4 100644 schedule(); } else { if (unlikely(!user_mode(regs))) -diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c -index 46cccd6bf705a..480a1976a9dce 100644 ---- a/arch/arm/mm/fault.c -+++ b/arch/arm/mm/fault.c -@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +diff -rupN linux.orig/arch/arm/mm/fault.c linux/arch/arm/mm/fault.c +--- linux.orig/arch/arm/mm/fault.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/mm/fault.c 2022-12-04 10:40:26.676034147 -0500 +@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); @@ -154,7 +148,7 @@ index 46cccd6bf705a..480a1976a9dce 100644 if (user_mode(regs)) goto bad_area; -@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -164,31 +158,10 @@ index 46cccd6bf705a..480a1976a9dce 100644 do_bad_area(addr, fsr, regs); return 0; } -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 3795eb5ba1cdd..6922949e61b71 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -93,6 +93,7 @@ config ARM64 - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_SUPPORTS_PAGE_TABLE_CHECK -+ select ARCH_SUPPORTS_RT - select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT - select ARCH_WANT_DEFAULT_BPF_JIT - select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -@@ -200,6 +201,7 @@ config ARM64 - select HAVE_PERF_USER_STACK_DUMP - select HAVE_PREEMPT_DYNAMIC_KEY - select HAVE_REGS_AND_STACK_ACCESS_API -+ select HAVE_PREEMPT_LAZY - select HAVE_POSIX_CPU_TIMERS_TASK_WORK - select HAVE_FUNCTION_ARG_ACCESS_API - select MMU_GATHER_RCU_TABLE_FREE -diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h -index 0159b625cc7f0..a5486918e5eeb 100644 ---- a/arch/arm64/include/asm/preempt.h -+++ b/arch/arm64/include/asm/preempt.h -@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void) +diff -rupN linux.orig/arch/arm64/include/asm/preempt.h linux/arch/arm64/include/asm/preempt.h +--- linux.orig/arch/arm64/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_a * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. 
*/ @@ -226,10 +199,9 @@ index 0159b625cc7f0..a5486918e5eeb 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 848739c15de82..4b7148fd5551f 100644 ---- a/arch/arm64/include/asm/thread_info.h -+++ b/arch/arm64/include/asm/thread_info.h +diff -rupN linux.orig/arch/arm64/include/asm/thread_info.h linux/arch/arm64/include/asm/thread_info.h +--- linux.orig/arch/arm64/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ @@ -238,7 +210,7 @@ index 848739c15de82..4b7148fd5551f 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_str #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ @@ -246,7 +218,7 @@ index 848739c15de82..4b7148fd5551f 100644 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_str #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -258,7 +230,7 @@ index 848739c15de82..4b7148fd5551f 100644 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) -@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_str _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) @@ -267,10 +239,28 @@ index 848739c15de82..4b7148fd5551f 100644 #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ -diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 1197e7679882e..e74c0415f67ea 100644 ---- a/arch/arm64/kernel/asm-offsets.c -+++ b/arch/arm64/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm64/Kconfig linux/arch/arm64/Kconfig +--- linux.orig/arch/arm64/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -93,6 +93,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_RT + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -200,6 +201,7 @@ config ARM64 + select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_DYNAMIC_KEY + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_FUNCTION_ARG_ACCESS_API + select MMU_GATHER_RCU_TABLE_FREE +diff -rupN linux.orig/arch/arm64/kernel/asm-offsets.c linux/arch/arm64/kernel/asm-offsets.c +--- linux.orig/arch/arm64/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -32,6 +32,7 @@ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); 
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); @@ -279,11 +269,10 @@ index 1197e7679882e..e74c0415f67ea 100644 #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif -diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index 9ad911f1647c8..545c41a84411e 100644 ---- a/arch/arm64/kernel/signal.c -+++ b/arch/arm64/kernel/signal.c -@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *regs) +diff -rupN linux.orig/arch/arm64/kernel/signal.c linux/arch/arm64/kernel/signal.c +--- linux.orig/arch/arm64/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *re void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { @@ -292,34 +281,10 @@ index 9ad911f1647c8..545c41a84411e 100644 /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index cbe7bb029aec8..ad5bcc255f4e3 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -149,6 +149,7 @@ config PPC - select ARCH_STACKWALK - select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if PPC64 - select ARCH_USE_MEMTEST -@@ -241,8 +242,10 @@ config PPC - select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select HAVE_RSEQ - select HAVE_SETUP_PER_CPU_AREA if PPC64 - select HAVE_SOFTIRQ_ON_OWN_STACK -diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h -index 1c8460e235838..b1653c160bab9 100644 ---- a/arch/powerpc/include/asm/stackprotector.h -+++ b/arch/powerpc/include/asm/stackprotector.h -@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) +diff -rupN linux.orig/arch/powerpc/include/asm/stackprotector.h linux/arch/powerpc/include/asm/stackprotector.h +--- linux.orig/arch/powerpc/include/asm/stackprotector.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/stackprotector.h 2022-12-04 10:40:26.676034147 -0500 +@@ -24,7 +24,11 @@ static __always_inline void boot_init_st unsigned long canary; /* Try to get a semi random initial value. 
*/ @@ -331,10 +296,9 @@ index 1c8460e235838..b1653c160bab9 100644 canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; -diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index af58f1ed3952e..520864de8bb27 100644 ---- a/arch/powerpc/include/asm/thread_info.h -+++ b/arch/powerpc/include/asm/thread_info.h +diff -rupN linux.orig/arch/powerpc/include/asm/thread_info.h linux/arch/powerpc/include/asm/thread_info.h +--- linux.orig/arch/powerpc/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -53,6 +53,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, @@ -389,11 +353,32 @@ index af58f1ed3952e..520864de8bb27 100644 /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ -diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c -index f9db0a172401a..38aa3d06c632c 100644 ---- a/arch/powerpc/kernel/interrupt.c -+++ b/arch/powerpc/kernel/interrupt.c -@@ -184,7 +184,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) +diff -rupN linux.orig/arch/powerpc/Kconfig linux/arch/powerpc/Kconfig +--- linux.orig/arch/powerpc/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -149,6 +149,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -241,8 +242,10 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 + select HAVE_SOFTIRQ_ON_OWN_STACK +diff -rupN linux.orig/arch/powerpc/kernel/interrupt.c linux/arch/powerpc/kernel/interrupt.c +--- linux.orig/arch/powerpc/kernel/interrupt.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/interrupt.c 2022-12-04 10:40:26.676034147 -0500 +@@ -184,7 +184,7 @@ again: ti_flags = read_thread_flags(); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); @@ -402,7 +387,7 @@ index f9db0a172401a..38aa3d06c632c 100644 schedule(); } else { /* -@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) +@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_ker /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: @@ -419,10 +404,9 @@ index f9db0a172401a..38aa3d06c632c 100644 } } -diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c -index dadfcef5d6db4..3bfe55d82b042 100644 ---- a/arch/powerpc/kernel/traps.c -+++ b/arch/powerpc/kernel/traps.c +diff -rupN linux.orig/arch/powerpc/kernel/traps.c linux/arch/powerpc/kernel/traps.c +--- linux.orig/arch/powerpc/kernel/traps.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/traps.c 2022-12-04 10:40:26.676034147 -0500 @@ -260,12 +260,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) @@ -442,10 +426,9 @@ index dadfcef5d6db4..3bfe55d82b042 100644 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig -index dcb398d5e0093..2cfa432afdb12 100644 ---- a/arch/powerpc/kvm/Kconfig -+++ b/arch/powerpc/kvm/Kconfig +diff -rupN linux.orig/arch/powerpc/kvm/Kconfig linux/arch/powerpc/kvm/Kconfig +--- linux.orig/arch/powerpc/kvm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kvm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -221,6 +221,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" @@ -454,10 +437,9 @@ index dcb398d5e0093..2cfa432afdb12 100644 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING -diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 561adac690229..61c4c0610aa6a 100644 ---- a/arch/powerpc/platforms/pseries/iommu.c -+++ b/arch/powerpc/platforms/pseries/iommu.c +diff -rupN linux.orig/arch/powerpc/platforms/pseries/iommu.c linux/arch/powerpc/platforms/pseries/iommu.c +--- linux.orig/arch/powerpc/platforms/pseries/iommu.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/platforms/pseries/iommu.c 2022-12-04 10:40:26.676034147 -0500 @@ -24,6 +24,7 @@ #include #include @@ -466,7 +448,7 @@ index 561adac690229..61c4c0610aa6a 100644 #include #include #include -@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, +@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned return ret; } @@ -481,7 +463,7 @@ index 561adac690229..61c4c0610aa6a 100644 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, -@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(stru direction, attrs); } @@ -494,7 +476,7 @@ index 561adac690229..61c4c0610aa6a 100644 /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() -@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { @@ -509,7 +491,7 @@ index 561adac690229..61c4c0610aa6a 100644 } rpn = __pa(uaddr) >> tceshift; -@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); @@ -518,7 +500,7 @@ index 561adac690229..61c4c0610aa6a 100644 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; -@@ -440,16 +448,17 @@ 
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP( DMA_BIDIRECTIONAL, 0); } @@ -540,7 +522,7 @@ index 561adac690229..61c4c0610aa6a 100644 } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; -@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP( /* error cleanup: caller will clear whole range */ @@ -549,31 +531,10 @@ index 561adac690229..61c4c0610aa6a 100644 return rc; } -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 159c025ebb03e..4d62ceece1bb0 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -109,6 +109,7 @@ config X86 - select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG - select ARCH_SUPPORTS_LTO_CLANG_THIN -+ select ARCH_SUPPORTS_RT - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_MEMTEST - select ARCH_USE_QUEUED_RWLOCKS -@@ -243,6 +244,7 @@ config X86 - select HAVE_PCI - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT - select MMU_GATHER_MERGE_VMAS - select HAVE_POSIX_CPU_TIMERS_TASK_WORK -diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h -index 5f6daea1ee248..cd20b4a5719a4 100644 ---- a/arch/x86/include/asm/preempt.h -+++ b/arch/x86/include/asm/preempt.h -@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) +diff -rupN linux.orig/arch/x86/include/asm/preempt.h linux/arch/x86/include/asm/preempt.h +--- linux.orig/arch/x86/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -90,17 +90,48 @@ static __always_inline void __preempt_co * a decrement which hits zero means we have no preempt_count and should * reschedule. 
*/ @@ -623,10 +584,9 @@ index 5f6daea1ee248..cd20b4a5719a4 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index f0cb881c1d690..fd8fb76f324fc 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h +diff -rupN linux.orig/arch/x86/include/asm/thread_info.h linux/arch/x86/include/asm/thread_info.h +--- linux.orig/arch/x86/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -57,6 +57,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ @@ -660,11 +620,29 @@ index f0cb881c1d690..fd8fb76f324fc 100644 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c -index fac8ff983aec8..65fb9bad1577a 100644 ---- a/drivers/bcma/driver_gpio.c -+++ b/drivers/bcma/driver_gpio.c -@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/arch/x86/Kconfig linux/arch/x86/Kconfig +--- linux.orig/arch/x86/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -109,6 +109,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -243,6 +244,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK +diff -rupN linux.orig/drivers/bcma/driver_gpio.c linux/drivers/bcma/driver_gpio.c +--- linux.orig/drivers/bcma/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/bcma/driver_gpio.c 2022-12-04 10:40:26.680034137 -0500 +@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) @@ -673,11 +651,10 @@ index fac8ff983aec8..65fb9bad1577a 100644 bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 226ea76cc8197..4043d909d41bf 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -60,6 +60,40 @@ static void zram_free_page(struct zram *zram, size_t index); +diff -rupN linux.orig/drivers/block/zram/zram_drv.c linux/drivers/block/zram/zram_drv.c +--- linux.orig/drivers/block/zram/zram_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.c 2022-12-04 10:40:26.680034137 -0500 +@@ -60,6 +60,40 @@ static void zram_free_page(struct zram * static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); @@ -718,7 +695,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static int zram_slot_trylock(struct zram *zram, u32 index) { -@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) +@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } @@ -726,7 +703,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static inline bool init_done(struct zram *zram) { -@@ 
-1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -734,10 +711,9 @@ index 226ea76cc8197..4043d909d41bf 100644 return true; } -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index 80c3b43b4828f..ff021a9728d1e 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h +diff -rupN linux.orig/drivers/block/zram/zram_drv.h linux/drivers/block/zram/zram_drv.h +--- linux.orig/drivers/block/zram/zram_drv.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.h 2022-12-04 10:40:26.680034137 -0500 @@ -63,6 +63,9 @@ struct zram_table_entry { unsigned long element; }; @@ -748,11 +724,10 @@ index 80c3b43b4828f..ff021a9728d1e 100644 #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif -diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c -index bcff6429e0b4f..4a9ae338a2bdf 100644 ---- a/drivers/char/tpm/tpm_tis.c -+++ b/drivers/char/tpm/tpm_tis.c -@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da +diff -rupN linux.orig/drivers/char/tpm/tpm_tis.c linux/drivers/char/tpm/tpm_tis.c +--- linux.orig/drivers/char/tpm/tpm_tis.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/char/tpm/tpm_tis.c 2022-12-04 10:40:26.680034137 -0500 +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to return container_of(data, struct tpm_tis_tcg_phy, priv); } @@ -784,7 +759,7 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); -@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, +@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tp switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) @@ -799,11 +774,10 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 break; } -diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c -index 64cb060d9d753..77a41151c921b 100644 ---- a/drivers/gpio/gpio-mlxbf2.c -+++ b/drivers/gpio/gpio-mlxbf2.c -@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) +diff -rupN linux.orig/drivers/gpio/gpio-mlxbf2.c linux/drivers/gpio/gpio-mlxbf2.c +--- linux.orig/drivers/gpio/gpio-mlxbf2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpio/gpio-mlxbf2.c 2022-12-04 10:40:26.680034137 -0500 +@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handl pending = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CAUSE_EVTEN0); writel(pending, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -816,23 +790,10 @@ index 64cb060d9d753..77a41151c921b 100644 return IRQ_RETVAL(pending); } -diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig -index 7ae3b7d67fcfc..844f54f1daea9 100644 ---- a/drivers/gpu/drm/i915/Kconfig -+++ b/drivers/gpu/drm/i915/Kconfig -@@ -3,7 +3,6 @@ config DRM_I915 - tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" - depends on DRM - depends on X86 && PCI -- depends on !PREEMPT_RT - select INTEL_GTT if X86 - select INTERVAL_TREE - # we need shmfs for the swappable backing store, and in particular -diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c -index 4442aa355f868..23085e82c3ed5 100644 ---- a/drivers/gpu/drm/i915/display/intel_crtc.c -+++ b/drivers/gpu/drm/i915/display/intel_crtc.c -@@ -522,7 +522,8 @@ void 
intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +diff -rupN linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c linux/drivers/gpu/drm/i915/display/intel_crtc.c +--- linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-04 10:40:26.680034137 -0500 +@@ -522,7 +522,8 @@ void intel_pipe_update_start(struct inte */ intel_psr_wait_for_idle_locked(new_crtc_state); @@ -842,7 +803,7 @@ index 4442aa355f868..23085e82c3ed5 100644 crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; -@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct inte break; } @@ -858,7 +819,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } finish_wait(wq, &wait); -@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct inte return; irq_disable: @@ -868,7 +829,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) -@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) +@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_ */ intel_vrr_send_push(new_crtc_state); @@ -878,11 +839,10 @@ index 4442aa355f868..23085e82c3ed5 100644 if (intel_vgpu_active(dev_priv)) return; -diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -index ecc990ec1b952..8d04b10681f0d 100644 ---- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-04 10:40:26.680034137 -0500 +@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct int /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { @@ -895,11 +855,10 @@ index ecc990ec1b952..8d04b10681f0d 100644 } } -diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -index c718e6dc40b51..0e592999b7d60 100644 ---- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct int * and context switches) submission. */ @@ -908,7 +867,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * If the queue is higher priority than the last -@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct int * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. 
*/ @@ -917,7 +876,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; } } -@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct int if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); @@ -926,7 +885,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; /* leave this for another sibling */ } -@@ -1590,7 +1590,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1590,7 +1590,7 @@ done: */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); @@ -935,7 +894,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * We can skip poking the HW if we ended up with exactly the same set -@@ -1616,13 +1616,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1616,13 +1616,6 @@ done: } } @@ -949,7 +908,7 @@ index c718e6dc40b51..0e592999b7d60 100644 static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); -@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) +@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet } if (!engine->execlists.pending[0]) { @@ -958,11 +917,10 @@ index c718e6dc40b51..0e592999b7d60 100644 start_timeslice(engine); } -diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 73cebc6aa6507..98305fb393413 100644 ---- a/drivers/gpu/drm/i915/i915_irq.c -+++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_irq.c linux/drivers/gpu/drm/i915/i915_irq.c +--- linux.orig/drivers/gpu/drm/i915/i915_irq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_irq.c 2022-12-04 10:40:26.680034137 -0500 +@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(str */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -972,7 +930,7 @@ index 73cebc6aa6507..98305fb393413 100644 /* Get optional system timestamp before query. 
*/ if (stime) -@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(str if (etime) *etime = ktime_get(); @@ -982,11 +940,10 @@ index 73cebc6aa6507..98305fb393413 100644 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c -index 62fad16a55e84..af07927650b24 100644 ---- a/drivers/gpu/drm/i915/i915_request.c -+++ b/drivers/gpu/drm/i915/i915_request.c -@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_request *request) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_request.c linux/drivers/gpu/drm/i915/i915_request.c +--- linux.orig/drivers/gpu/drm/i915/i915_request.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_request.c 2022-12-04 10:40:26.680034137 -0500 +@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_r RQ_TRACE(request, "\n"); @@ -994,7 +951,7 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915_request *request) +@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915 */ RQ_TRACE(request, "\n"); @@ -1002,10 +959,9 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h -index 37b5c9e9d260e..73f29d8008f0c 100644 ---- a/drivers/gpu/drm/i915/i915_trace.h -+++ b/drivers/gpu/drm/i915/i915_trace.h +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_trace.h linux/drivers/gpu/drm/i915/i915_trace.h +--- linux.orig/drivers/gpu/drm/i915/i915_trace.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_trace.h 2022-12-04 10:40:26.680034137 -0500 @@ -6,6 +6,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ @@ -1017,7 +973,7 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 #include #include #include -@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_add, +@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_ TP_ARGS(rq) ); @@ -1026,11 +982,10 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) -diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h -index c10d68cdc3ca5..593f3a7e0e4fc 100644 ---- a/drivers/gpu/drm/i915/i915_utils.h -+++ b/drivers/gpu/drm/i915/i915_utils.h -@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_utils.h linux/drivers/gpu/drm/i915/i915_utils.h +--- linux.orig/drivers/gpu/drm/i915/i915_utils.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_utils.h 2022-12-04 10:40:26.680034137 -0500 +@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ @@ -1039,10 +994,20 @@ index c10d68cdc3ca5..593f3a7e0e4fc 100644 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) -diff --git a/drivers/net/ethernet/alacritech/slic.h b/drivers/net/ethernet/alacritech/slic.h -index 4eecbdfff3ff1..82071d0e5f7fc 100644 ---- a/drivers/net/ethernet/alacritech/slic.h -+++ b/drivers/net/ethernet/alacritech/slic.h +diff -rupN linux.orig/drivers/gpu/drm/i915/Kconfig linux/drivers/gpu/drm/i915/Kconfig +--- linux.orig/drivers/gpu/drm/i915/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/Kconfig 2022-12-04 10:40:26.680034137 -0500 +@@ -3,7 +3,6 @@ config DRM_I915 + tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" + depends on DRM + depends on X86 && PCI +- depends on !PREEMPT_RT + select INTEL_GTT if X86 + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular +diff -rupN linux.orig/drivers/net/ethernet/alacritech/slic.h linux/drivers/net/ethernet/alacritech/slic.h +--- linux.orig/drivers/net/ethernet/alacritech/slic.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/alacritech/slic.h 2022-12-04 10:40:26.680034137 -0500 @@ -288,13 +288,13 @@ do { \ u64_stats_update_end(&(st)->syncp); \ } while (0) @@ -1063,11 +1028,10 @@ index 4eecbdfff3ff1..82071d0e5f7fc 100644 } struct slic_upr { -diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -index 39242c5a17290..8f81d288c4880 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c -+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *sr unsigned int start; do { @@ -1079,11 +1043,10 @@ index 39242c5a17290..8f81d288c4880 100644 } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) -diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c -index 6a356a6cee15a..1c5d482990806 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_netdev.c -+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c -@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c linux/drivers/net/ethernet/amazon/ena/ena_netdev.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-04 10:40:26.680034137 -0500 +@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_d tx_ring = &adapter->tx_ring[i]; do { @@ -1096,7 +1059,7 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_device *netdev, +@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_d rx_ring = &adapter->rx_ring[i]; do { @@ -1121,11 +1084,10 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->rx_dropped = rx_drops; stats->tx_dropped = tx_drops; -diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 
-index 25129e723b575..1e8d902e1c8ea 100644 ---- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +--- linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-04 10:40:26.680034137 -0500 +@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(str /* This data should mimic aq_ethtool_queue_rx_stat_names structure */ do { count = 0; @@ -1134,7 +1096,7 @@ index 25129e723b575..1e8d902e1c8ea 100644 data[count] = self->stats.rx.packets; data[++count] = self->stats.rx.jumbo_packets; data[++count] = self->stats.rx.lro_packets; -@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(str data[++count] = self->stats.rx.xdp_tx; data[++count] = self->stats.rx.xdp_invalid; data[++count] = self->stats.rx.xdp_redirect; @@ -1153,11 +1115,10 @@ index 25129e723b575..1e8d902e1c8ea 100644 } return ++count; -diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c -index 6ba5b024a7be7..25e7beb68e515 100644 ---- a/drivers/net/ethernet/asix/ax88796c_main.c -+++ b/drivers/net/ethernet/asix/ax88796c_main.c -@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/asix/ax88796c_main.c linux/drivers/net/ethernet/asix/ax88796c_main.c +--- linux.orig/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-04 10:40:26.680034137 -0500 +@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct s = per_cpu_ptr(ax_local->stats, cpu); do { @@ -1172,11 +1133,10 @@ index 6ba5b024a7be7..25e7beb68e515 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c -index e5857e88c2076..caf1714f36a18 100644 ---- a/drivers/net/ethernet/broadcom/b44.c -+++ b/drivers/net/ethernet/broadcom/b44.c -@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/b44.c linux/drivers/net/ethernet/broadcom/b44.c +--- linux.orig/drivers/net/ethernet/broadcom/b44.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/b44.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_d unsigned int start; do { @@ -1185,7 +1145,7 @@ index e5857e88c2076..caf1714f36a18 100644 /* Convert HW stats into rtnl_link_stats64 stats. 
*/ nstat->rx_packets = hwstat->rx_pkts; -@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_device *dev, +@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_d /* Carrier lost counter seems to be broken for some devices */ nstat->tx_carrier_errors = hwstat->tx_carrier_lost; #endif @@ -1194,7 +1154,7 @@ index e5857e88c2076..caf1714f36a18 100644 } -@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct net_device *dev, +@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct do { data_src = &hwstat->tx_good_octets; data_dst = data; @@ -1209,11 +1169,10 @@ index e5857e88c2076..caf1714f36a18 100644 } static void b44_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c -index 47fc8e6963d59..98d5bd15ee433 100644 ---- a/drivers/net/ethernet/broadcom/bcmsysport.c -+++ b/drivers/net/ethernet/broadcom/bcmsysport.c -@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c linux/drivers/net/ethernet/broadcom/bcmsysport.c +--- linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-04 10:40:26.680034137 -0500 +@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats( for (q = 0; q < priv->netdev->num_tx_queues; q++) { ring = &priv->tx_rings[q]; do { @@ -1226,7 +1185,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 *tx_bytes += bytes; *tx_packets += packets; -@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct net_device *dev, +@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct if (s->stat_sizeof == sizeof(u64) && s->type == BCM_SYSPORT_STAT_NETDEV64) { do { @@ -1238,7 +1197,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } else data[i] = *(u32 *)p; j++; -@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(struct net_device *dev, +@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(stru &stats->tx_packets); do { @@ -1251,11 +1210,10 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } static void bcm_sysport_netif_start(struct net_device *dev) -diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c -index 6dae768671e3d..9e6de2f968fa3 100644 ---- a/drivers/net/ethernet/cortina/gemini.c -+++ b/drivers/net/ethernet/cortina/gemini.c -@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/cortina/gemini.c linux/drivers/net/ethernet/cortina/gemini.c +--- linux.orig/drivers/net/ethernet/cortina/gemini.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/cortina/gemini.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_ /* Racing with RX NAPI */ do { @@ -1264,7 +1222,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_packets = port->stats.rx_packets; stats->rx_bytes = port->stats.rx_bytes; -@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_ stats->rx_crc_errors = port->stats.rx_crc_errors; stats->rx_frame_errors = port->stats.rx_frame_errors; @@ -1278,7 +1236,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->tx_errors = port->stats.tx_errors; stats->tx_packets = port->stats.tx_packets; -@@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ 
-1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_ stats->rx_missed_errors = port->stats.rx_missed_errors; stats->rx_fifo_errors = port->stats.rx_fifo_errors; @@ -1297,7 +1255,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_dropped += stats->rx_missed_errors; } -@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struc /* Racing with MIB interrupt */ do { p = values; @@ -1319,7 +1277,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < RX_STATUS_NUM; i++) *p++ = port->rx_stats[i]; -@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struc *p++ = port->rx_csum_stats[i]; *p++ = port->rx_napi_exits; @@ -1335,7 +1293,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < TX_MAX_FRAGS; i++) { *values++ = port->tx_frag_stats[i]; -@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struc *values++ = port->tx_frags_linearized; *values++ = port->tx_hw_csummed; @@ -1344,11 +1302,10 @@ index 6dae768671e3d..9e6de2f968fa3 100644 } static int gmac_get_ksettings(struct net_device *netdev, -diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c -index bd0df189d8719..39e7a4a3c15e6 100644 ---- a/drivers/net/ethernet/emulex/benet/be_ethtool.c -+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c -@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c linux/drivers/net/ethernet/emulex/benet/be_ethtool.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct struct be_rx_stats *stats = rx_stats(rxo); do { @@ -1361,7 +1318,7 @@ index bd0df189d8719..39e7a4a3c15e6 100644 for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { p = (u8 *)stats + et_rx_stats[i].offset; -@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct net_device *netdev, +@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct struct be_tx_stats *stats = tx_stats(txo); do { @@ -1385,11 +1342,10 @@ index bd0df189d8719..39e7a4a3c15e6 100644 base += ETHTOOL_TXSTATS_NUM; } } -diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c -index 414362febbb9d..9350c901aa27b 100644 ---- a/drivers/net/ethernet/emulex/benet/be_main.c -+++ b/drivers/net/ethernet/emulex/benet/be_main.c -@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_main.c linux/drivers/net/ethernet/emulex/benet/be_main.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_de const struct be_rx_stats *rx_stats = rx_stats(rxo); do { @@ -1402,7 +1358,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->rx_packets += pkts; stats->rx_bytes += bytes; stats->multicast += rx_stats(rxo)->rx_mcast_pkts; -@@ -680,10 +680,10 @@ static void be_get_stats64(struct net_device *netdev, +@@ -680,10 +680,10 @@ static void 
be_get_stats64(struct net_de const struct be_tx_stats *tx_stats = tx_stats(txo); do { @@ -1415,7 +1371,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->tx_packets += pkts; stats->tx_bytes += bytes; } -@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_obj *eqo) +@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_o for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { do { @@ -1436,10 +1392,9 @@ index 414362febbb9d..9350c901aa27b 100644 } /* Skip, if wrapped around or first calculation */ -diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -index 671f51135c269..53b7e95213a85 100644 ---- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +diff -rupN linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +--- linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-04 10:40:26.684034126 -0500 @@ -206,9 +206,9 @@ struct funeth_rxq { #define FUN_QSTAT_READ(q, seq, stats_copy) \ @@ -1452,11 +1407,10 @@ index 671f51135c269..53b7e95213a85 100644 #define FUN_INT_NAME_LEN (IFNAMSIZ + 16) -diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c -index 7b9a2d9d96243..50b384910c839 100644 ---- a/drivers/net/ethernet/google/gve/gve_ethtool.c -+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c -@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c linux/drivers/net/ethernet/google/gve/gve_ethtool.c +--- linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device struct gve_rx_ring *rx = &priv->rx[ring]; start = @@ -1473,7 +1427,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); rx_pkts += tmp_rx_pkts; rx_bytes += tmp_rx_bytes; -@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device if (priv->tx) { do { start = @@ -1486,7 +1440,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); tx_pkts += tmp_tx_pkts; tx_bytes += tmp_tx_bytes; -@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device data[i++] = rx->fill_cnt - rx->cnt; do { start = @@ -1502,7 +1456,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_rx_bytes; data[i++] = rx->rx_cont_packet_cnt; -@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device } do { start = @@ -1514,11 +1468,10 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; -diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c -index 044db3ebb071c..6cafee55efc32 100644 ---- a/drivers/net/ethernet/google/gve/gve_main.c -+++ b/drivers/net/ethernet/google/gve/gve_main.c -@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_main.c linux/drivers/net/ethernet/google/gve/gve_main.c +--- 
linux.orig/drivers/net/ethernet/google/gve/gve_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = @@ -1531,7 +1484,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->rx_packets += packets; s->rx_bytes += bytes; -@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = @@ -1544,7 +1497,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->tx_packets += packets; s->tx_bytes += bytes; -@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv) +@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_ } do { @@ -1556,11 +1509,10 @@ index 044db3ebb071c..6cafee55efc32 100644 stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_WAKE_CNT), .value = cpu_to_be64(priv->tx[idx].wake_queue), -diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -index 35d70041b9e84..f82e98263307a 100644 ---- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -@@ -2486,7 +2486,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-04 10:40:26.684034126 -0500 +@@ -2488,7 +2488,7 @@ static void hns3_fetch_stats(struct rtnl unsigned int start; do { @@ -1569,7 +1521,7 @@ index 35d70041b9e84..f82e98263307a 100644 if (is_tx) { stats->tx_bytes += ring->stats.tx_bytes; stats->tx_packets += ring->stats.tx_pkts; -@@ -2520,7 +2520,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +@@ -2522,7 +2522,7 @@ static void hns3_fetch_stats(struct rtnl stats->multicast += ring->stats.rx_multicast; stats->rx_length_errors += ring->stats.err_pkt_len; } @@ -1578,11 +1530,5909 @@ index 35d70041b9e84..f82e98263307a 100644 } static void hns3_nic_get_stats64(struct net_device *netdev, -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -index e5828a658caf4..a866bea651103 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 2022-12-04 10:40:18.116056079 -0500 +@@ -0,0 +1,5895 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++// Copyright (c) 2016-2017 Hisilicon Limited. 
++ ++#include ++#include ++#include ++#ifdef CONFIG_RFS_ACCEL ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "hnae3.h" ++#include "hns3_enet.h" ++/* All hns3 tracepoints are defined by the include below, which ++ * must be included exactly once across the whole kernel with ++ * CREATE_TRACE_POINTS defined ++ */ ++#define CREATE_TRACE_POINTS ++#include "hns3_trace.h" ++ ++#define hns3_set_field(origin, shift, val) ((origin) |= (val) << (shift)) ++#define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) ++ ++#define hns3_rl_err(fmt, ...) \ ++ do { \ ++ if (net_ratelimit()) \ ++ netdev_err(fmt, ##__VA_ARGS__); \ ++ } while (0) ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); ++ ++static const char hns3_driver_name[] = "hns3"; ++static const char hns3_driver_string[] = ++ "Hisilicon Ethernet Network Driver for Hip08 Family"; ++static const char hns3_copyright[] = "Copyright (c) 2017 Huawei Corporation."; ++static struct hnae3_client client; ++ ++static int debug = -1; ++module_param(debug, int, 0); ++MODULE_PARM_DESC(debug, " Network interface message level setting"); ++ ++static unsigned int tx_sgl = 1; ++module_param(tx_sgl, uint, 0600); ++MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); ++ ++static bool page_pool_enabled = true; ++module_param(page_pool_enabled, bool, 0400); ++ ++#define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ ++ sizeof(struct sg_table)) ++#define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM), \ ++ dma_get_cache_alignment()) ++ ++#define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ ++ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP) ++ ++#define HNS3_INNER_VLAN_TAG 1 ++#define HNS3_OUTER_VLAN_TAG 2 ++ ++#define HNS3_MIN_TX_LEN 33U ++#define HNS3_MIN_TUN_PKT_LEN 65U ++ ++/* hns3_pci_tbl - PCI Device ID Table ++ * ++ * Last entry must be all 0s ++ * ++ * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, ++ * Class, Class Mask, private data (not used) } ++ */ ++static const struct pci_device_id hns3_pci_tbl[] = { ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_200G_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_VF), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_RDMA_DCB_PFC_VF), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ /* required last entry */ ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, hns3_pci_tbl); ++ ++#define HNS3_RX_PTYPE_ENTRY(ptype, l, s, t, h) \ ++ { ptype, \ ++ l, \ ++ CHECKSUM_##s, \ ++ HNS3_L3_TYPE_##t, \ ++ 1, \ ++ h} ++ ++#define HNS3_RX_PTYPE_UNUSED_ENTRY(ptype) \ ++ { ptype, 0, CHECKSUM_NONE, HNS3_L3_TYPE_PARSE_FAIL, 0, \ ++ PKT_HASH_TYPE_NONE } ++ ++static const struct hns3_rx_ptype hns3_rx_ptype_tbl[] = { ++ HNS3_RX_PTYPE_UNUSED_ENTRY(0), ++ HNS3_RX_PTYPE_ENTRY(1, 0, COMPLETE, ARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(2, 0, 
COMPLETE, RARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(3, 0, COMPLETE, LLDP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(4, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(5, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(6, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(7, 0, COMPLETE, CNM, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(8, 0, NONE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(9), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(10), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(11), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(12), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(13), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(14), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(15), ++ HNS3_RX_PTYPE_ENTRY(16, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(17, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(18, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(19, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(20, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(21, 0, NONE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(22, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(23, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(24, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(25, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(26), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(27), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(28), ++ HNS3_RX_PTYPE_ENTRY(29, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(30, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(31, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(32, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(33, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(34, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(35, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(36, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(37, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(38), ++ HNS3_RX_PTYPE_ENTRY(39, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(40, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(41, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(42, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(43, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(44, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(45, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(46), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(47), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(48), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(49), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(50), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(51), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(52), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(53), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(54), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(55), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(56), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(57), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(58), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(59), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(60), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(61), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(62), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(63), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(64), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(65), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(66), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(67), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(68), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(69), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(70), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(71), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(72), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(73), 
++ HNS3_RX_PTYPE_UNUSED_ENTRY(74), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(75), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(76), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(77), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(78), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(79), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(80), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(81), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(82), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(83), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(84), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(85), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(86), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(87), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(88), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(89), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(90), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(91), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(92), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(93), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(94), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(95), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(96), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(97), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(98), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(99), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(100), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(101), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(102), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(103), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(104), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(105), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(106), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(107), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(108), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(109), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(110), ++ HNS3_RX_PTYPE_ENTRY(111, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(112, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(113, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(114, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(115, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(116, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(117, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(118, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(119, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(120), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(121), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(122), ++ HNS3_RX_PTYPE_ENTRY(123, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(124, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(125, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(126, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(127, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(128, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(129, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(130, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(131, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(132), ++ HNS3_RX_PTYPE_ENTRY(133, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(134, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(135, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(136, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(137, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(138, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(139, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(140), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(141), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(142), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(143), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(144), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(145), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(146), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(147), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(148), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(149), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(150), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(151), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(152), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(153), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(154), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(155), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(156), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(157), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(158), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(159), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(160), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(161), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(162), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(163), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(164), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(165), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(166), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(167), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(168), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(169), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(170), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(171), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(172), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(173), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(174), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(175), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(176), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(177), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(178), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(179), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(180), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(181), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(182), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(183), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(184), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(185), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(186), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(187), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(188), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(189), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(190), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(191), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(192), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(193), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(194), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(195), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(196), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(197), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(198), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(199), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(200), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(201), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(202), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(203), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(204), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(205), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(206), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(207), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(208), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(209), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(210), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(211), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(212), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(213), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(214), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(215), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(216), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(217), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(218), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(219), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(220), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(221), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(222), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(223), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(224), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(225), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(226), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(227), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(228), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(229), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(230), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(231), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(232), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(233), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(234), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(235), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(236), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(237), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(238), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(239), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(240), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(241), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(242), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(243), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(244), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(245), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(246), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(247), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(248), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(249), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(250), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(251), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(252), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(253), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(254), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(255), ++}; ++ ++#define HNS3_INVALID_PTYPE \ ++ ARRAY_SIZE(hns3_rx_ptype_tbl) ++ ++static irqreturn_t hns3_irq_handle(int irq, void *vector) ++{ ++ struct hns3_enet_tqp_vector *tqp_vector = vector; ++ ++ napi_schedule_irqoff(&tqp_vector->napi); ++ tqp_vector->event_cnt++; ++ ++ return IRQ_HANDLED; ++} ++ ++static void hns3_nic_uninit_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ unsigned int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag != HNS3_VECTOR_INITED) ++ continue; ++ ++ /* clear the affinity mask */ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, NULL); ++ ++ /* release the irq resource */ ++ free_irq(tqp_vectors->vector_irq, tqp_vectors); ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_NOT_INITED; ++ } ++} ++ ++static int hns3_nic_init_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ int txrx_int_idx = 0; ++ int rx_int_idx = 0; ++ int tx_int_idx = 0; ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag == HNS3_VECTOR_INITED) ++ continue; ++ ++ if (tqp_vectors->tx_group.ring && tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "TxRx", txrx_int_idx++); ++ txrx_int_idx++; ++ } else if (tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Rx", rx_int_idx++); ++ } else if (tqp_vectors->tx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Tx", tx_int_idx++); ++ } else { ++ /* Skip this unused q_vector */ ++ continue; ++ } ++ ++ tqp_vectors->name[HNAE3_INT_NAME_LEN - 1] = '\0'; ++ ++ irq_set_status_flags(tqp_vectors->vector_irq, IRQ_NOAUTOEN); ++ ret = request_irq(tqp_vectors->vector_irq, hns3_irq_handle, 0, ++ tqp_vectors->name, tqp_vectors); ++ if (ret) { ++ netdev_err(priv->netdev, "request irq(%d) fail\n", ++ tqp_vectors->vector_irq); ++ hns3_nic_uninit_irq(priv); ++ return ret; ++ } ++ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, ++ &tqp_vectors->affinity_mask); ++ ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_INITED; ++ } ++ ++ return 0; ++} ++ ++static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 mask_en) ++{ ++ writel(mask_en, tqp_vector->mask_addr); ++} ++ ++static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ napi_enable(&tqp_vector->napi); ++ enable_irq(tqp_vector->vector_irq); ++ ++ /* enable vector */ ++ hns3_mask_vector_irq(tqp_vector, 1); ++} ++ ++static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ /* disable vector */ ++ hns3_mask_vector_irq(tqp_vector, 0); ++ ++ disable_irq(tqp_vector->vector_irq); ++ napi_disable(&tqp_vector->napi); ++ cancel_work_sync(&tqp_vector->rx_group.dim.work); ++ cancel_work_sync(&tqp_vector->tx_group.dim.work); ++} ++ ++void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 rl_value) ++{ ++ u32 rl_reg = hns3_rl_usec_to_reg(rl_value); ++ ++ /* this 
defines the configuration for RL (Interrupt Rate Limiter). ++ * Rl defines rate of interrupts i.e. number of interrupts-per-second ++ * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing ++ */ ++ if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable && ++ !tqp_vector->rx_group.coal.adapt_enable) ++ /* According to the hardware, the range of rl_reg is ++ * 0-59 and the unit is 4. ++ */ ++ rl_reg |= HNS3_INT_RL_ENABLE_MASK; ++ ++ writel(rl_reg, tqp_vector->mask_addr + HNS3_VECTOR_RL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->rx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->tx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET); ++} ++ ++static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hns3_enet_coalesce *ptx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *prx_coal = &priv->rx_coal; ++ ++ tx_coal->adapt_enable = ptx_coal->adapt_enable; ++ rx_coal->adapt_enable = prx_coal->adapt_enable; ++ ++ tx_coal->int_gl = ptx_coal->int_gl; ++ rx_coal->int_gl = prx_coal->int_gl; ++ ++ rx_coal->flow_level = prx_coal->flow_level; ++ tx_coal->flow_level = ptx_coal->flow_level; ++ ++ /* device version above V3(include V3), GL can configure 1us ++ * unit, so uses 1us unit. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { ++ tx_coal->unit_1us = 1; ++ rx_coal->unit_1us = 1; ++ } ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->ql_enable = 1; ++ rx_coal->ql_enable = 1; ++ tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ tx_coal->int_ql = ptx_coal->int_ql; ++ rx_coal->int_ql = prx_coal->int_ql; ++ } ++} ++ ++static void ++hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl); ++ hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl); ++ hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting); ++ ++ if (tx_coal->ql_enable) ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql); ++ ++ if (rx_coal->ql_enable) ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql); ++} ++ ++static int hns3_nic_set_real_num_queue(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ unsigned int queue_size = kinfo->num_tqps; ++ int i, ret; ++ ++ if (tc_info->num_tc <= 1 && !tc_info->mqprio_active) { ++ netdev_reset_tc(netdev); ++ } else { ++ ret = netdev_set_num_tc(netdev, tc_info->num_tc); ++ if (ret) { ++ netdev_err(netdev, ++ "netdev_set_num_tc fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ for (i = 0; i < tc_info->num_tc; i++) ++ netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], ++ tc_info->tqp_offset[i]); ++ } ++ ++ ret = netif_set_real_num_tx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_tx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ ret = netif_set_real_num_rx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++u16 hns3_get_max_available_channels(struct hnae3_handle *h) ++{ ++ u16 alloc_tqps, max_rss_size, rss_size; ++ ++ h->ae_algo->ops->get_tqps_and_rss_info(h, &alloc_tqps, &max_rss_size); ++ rss_size = alloc_tqps / h->kinfo.tc_info.num_tc; ++ ++ return min_t(u16, rss_size, max_rss_size); ++} ++ ++static void hns3_tqp_enable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg |= BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_tqp_disable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg &= ~BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_free_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ free_irq_cpu_rmap(netdev->rx_cpu_rmap); ++ netdev->rx_cpu_rmap = NULL; ++#endif ++} ++ ++static int hns3_set_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i, ret; ++ ++ if (!netdev->rx_cpu_rmap) { ++ netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->vector_num); ++ if (!netdev->rx_cpu_rmap) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = 
irq_cpu_rmap_add(netdev->rx_cpu_rmap, ++ tqp_vector->vector_irq); ++ if (ret) { ++ hns3_free_rx_cpu_rmap(netdev); ++ return ret; ++ } ++ } ++#endif ++ return 0; ++} ++ ++static int hns3_nic_net_up(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ int i, j; ++ int ret; ++ ++ ret = hns3_nic_reset_all_ring(h); ++ if (ret) ++ return ret; ++ ++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ /* enable the vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_enable(&priv->tqp_vector[i]); ++ ++ /* enable rcb */ ++ for (j = 0; j < h->kinfo.num_tqps; j++) ++ hns3_tqp_enable(h->kinfo.tqp[j]); ++ ++ /* start the ae_dev */ ++ ret = h->ae_algo->ops->start ? h->ae_algo->ops->start(h) : 0; ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ while (j--) ++ hns3_tqp_disable(h->kinfo.tqp[j]); ++ ++ for (j = i - 1; j >= 0; j--) ++ hns3_vector_disable(&priv->tqp_vector[j]); ++ } ++ ++ return ret; ++} ++ ++static void hns3_config_xps(struct hns3_nic_priv *priv) ++{ ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector = &priv->tqp_vector[i]; ++ struct hns3_enet_ring *ring = tqp_vector->tx_group.ring; ++ ++ while (ring) { ++ int ret; ++ ++ ret = netif_set_xps_queue(priv->netdev, ++ &tqp_vector->affinity_mask, ++ ring->tqp->tqp_index); ++ if (ret) ++ netdev_warn(priv->netdev, ++ "set xps queue failed: %d", ret); ++ ++ ring = ring->next; ++ } ++ } ++} ++ ++static int hns3_nic_net_open(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo; ++ int i, ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netdev_warn(netdev, "net open repeatedly!\n"); ++ return 0; ++ } ++ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_nic_set_real_num_queue(netdev); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_net_up(netdev); ++ if (ret) { ++ netdev_err(netdev, "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ kinfo = &h->kinfo; ++ for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) ++ netdev_set_prio_tc_map(netdev, i, kinfo->tc_info.prio_tc[i]); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, true); ++ ++ hns3_config_xps(priv); ++ ++ netif_dbg(h, drv, netdev, "net open\n"); ++ ++ return 0; ++} ++ ++static void hns3_reset_tx_queue(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct netdev_queue *dev_queue; ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ dev_queue = netdev_get_tx_queue(ndev, ++ priv->ring[i].queue_index); ++ netdev_tx_reset_queue(dev_queue); ++ } ++} ++ ++static void hns3_nic_net_down(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ const struct hnae3_ae_ops *ops; ++ int i; ++ ++ /* disable vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_disable(&priv->tqp_vector[i]); ++ ++ /* disable rcb */ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_tqp_disable(h->kinfo.tqp[i]); ++ ++ /* stop ae_dev */ ++ ops = priv->ae_handle->ae_algo->ops; ++ if (ops->stop) ++ ops->stop(priv->ae_handle); ++ ++ /* delay ring buffer clearing to hns3_reset_notify_uninit_enet ++ * during reset process, because driver may not be able ++ * to disable 
the ring through firmware when downing the netdev. ++ */ ++ if (!hns3_nic_resetting(netdev)) ++ hns3_clear_all_ring(priv->ae_handle, false); ++ ++ hns3_reset_tx_queue(priv->ae_handle); ++} ++ ++static int hns3_nic_net_stop(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, "net stop\n"); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, false); ++ ++ netif_carrier_off(netdev); ++ netif_tx_disable(netdev); ++ ++ hns3_nic_net_down(netdev); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_uc_addr) ++ return h->ae_algo->ops->add_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ /* need ignore the request of removing device address, because ++ * we store the device address and other addresses of uc list ++ * in the function's mac filter list. ++ */ ++ if (ether_addr_equal(addr, netdev->dev_addr)) ++ return 0; ++ ++ if (h->ae_algo->ops->rm_uc_addr) ++ return h->ae_algo->ops->rm_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_mc_addr) ++ return h->ae_algo->ops->add_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->rm_mc_addr) ++ return h->ae_algo->ops->rm_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static u8 hns3_get_netdev_flags(struct net_device *netdev) ++{ ++ u8 flags = 0; ++ ++ if (netdev->flags & IFF_PROMISC) ++ flags = HNAE3_USER_UPE | HNAE3_USER_MPE | HNAE3_BPE; ++ else if (netdev->flags & IFF_ALLMULTI) ++ flags = HNAE3_USER_MPE; ++ ++ return flags; ++} ++ ++static void hns3_nic_set_rx_mode(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u8 new_flags; ++ ++ new_flags = hns3_get_netdev_flags(netdev); ++ ++ __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); ++ __dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync); ++ ++ /* User mode Promisc mode enable and vlan filtering is disabled to ++ * let all packets in. ++ */ ++ h->netdev_flags = new_flags; ++ hns3_request_update_promisc_mode(h); ++} ++ ++void hns3_request_update_promisc_mode(struct hnae3_handle *handle) ++{ ++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops; ++ ++ if (ops->request_update_promisc_mode) ++ ops->request_update_promisc_mode(handle); ++} ++ ++static u32 hns3_tx_spare_space(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc, ntu; ++ ++ /* This smp_load_acquire() pairs with smp_store_release() in ++ * hns3_tx_spare_update() called in tx desc cleaning process. ++ */ ++ ntc = smp_load_acquire(&tx_spare->last_to_clean); ++ ntu = tx_spare->next_to_use; ++ ++ if (ntc > ntu) ++ return ntc - ntu - 1; ++ ++ /* The free tx buffer is divided into two part, so pick the ++ * larger one. 
++ */ ++ return max(ntc, tx_spare->len - ntu) - 1; ++} ++ ++static void hns3_tx_spare_update(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (!tx_spare || ++ tx_spare->last_to_clean == tx_spare->next_to_clean) ++ return; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * hns3_tx_spare_space() called in xmit process. ++ */ ++ smp_store_release(&tx_spare->last_to_clean, ++ tx_spare->next_to_clean); ++} ++ ++static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ u32 len = skb->len <= ring->tx_copybreak ? skb->len : ++ skb_headlen(skb); ++ ++ if (len > ring->tx_copybreak) ++ return false; ++ ++ if (ALIGN(len, dma_get_cache_alignment()) > space) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ if (skb->len <= ring->tx_copybreak || !tx_sgl || ++ (!skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < tx_sgl)) ++ return false; ++ ++ if (space < HNS3_MAX_SGL_SIZE) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) ++{ ++ u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; ++ struct hns3_tx_spare *tx_spare; ++ struct page *page; ++ dma_addr_t dma; ++ int order; ++ ++ if (!alloc_size) ++ return; ++ ++ order = get_order(alloc_size); ++ if (order >= MAX_ORDER) { ++ if (net_ratelimit()) ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); ++ return; ++ } ++ ++ tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), ++ GFP_KERNEL); ++ if (!tx_spare) { ++ /* The driver still work without the tx spare buffer */ ++ dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); ++ goto devm_kzalloc_error; ++ } ++ ++ page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), ++ GFP_KERNEL, order); ++ if (!page) { ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); ++ goto alloc_pages_error; ++ } ++ ++ dma = dma_map_page(ring_to_dev(ring), page, 0, ++ PAGE_SIZE << order, DMA_TO_DEVICE); ++ if (dma_mapping_error(ring_to_dev(ring), dma)) { ++ dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); ++ goto dma_mapping_error; ++ } ++ ++ tx_spare->dma = dma; ++ tx_spare->buf = page_address(page); ++ tx_spare->len = PAGE_SIZE << order; ++ ring->tx_spare = tx_spare; ++ return; ++ ++dma_mapping_error: ++ put_page(page); ++alloc_pages_error: ++ devm_kfree(ring_to_dev(ring), tx_spare); ++devm_kzalloc_error: ++ ring->tqp->handle->kinfo.tx_spare_buf_size = 0; ++} ++ ++/* Use hns3_tx_spare_space() to make sure there is enough buffer ++ * before calling below function to allocate tx buffer. ++ */ ++static void *hns3_tx_spare_alloc(struct hns3_enet_ring *ring, ++ unsigned int size, dma_addr_t *dma, ++ u32 *cb_len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntu = tx_spare->next_to_use; ++ ++ size = ALIGN(size, dma_get_cache_alignment()); ++ *cb_len = size; ++ ++ /* Tx spare buffer wraps back here because the end of ++ * freed tx buffer is not enough. 
++ */ ++ if (ntu + size > tx_spare->len) { ++ *cb_len += (tx_spare->len - ntu); ++ ntu = 0; ++ } ++ ++ tx_spare->next_to_use = ntu + size; ++ if (tx_spare->next_to_use == tx_spare->len) ++ tx_spare->next_to_use = 0; ++ ++ *dma = tx_spare->dma + ntu; ++ ++ return tx_spare->buf + ntu; ++} ++ ++static void hns3_tx_spare_rollback(struct hns3_enet_ring *ring, u32 len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (len > tx_spare->next_to_use) { ++ len -= tx_spare->next_to_use; ++ tx_spare->next_to_use = tx_spare->len - len; ++ } else { ++ tx_spare->next_to_use -= len; ++ } ++} ++ ++static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc = tx_spare->next_to_clean; ++ u32 len = cb->length; ++ ++ tx_spare->next_to_clean += len; ++ ++ if (tx_spare->next_to_clean >= tx_spare->len) { ++ tx_spare->next_to_clean -= tx_spare->len; ++ ++ if (tx_spare->next_to_clean) { ++ ntc = 0; ++ len = tx_spare->next_to_clean; ++ } ++ } ++ ++ /* This tx spare buffer is only really reclaimed after calling ++ * hns3_tx_spare_update(), so it is still safe to use the info in ++ * the tx buffer to do the dma sync or sg unmapping after ++ * tx_spare->next_to_clean is moved forword. ++ */ ++ if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { ++ dma_addr_t dma = tx_spare->dma + ntc; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, ++ DMA_TO_DEVICE); ++ } else { ++ struct sg_table *sgt = tx_spare->buf + ntc; ++ ++ dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ } ++} ++ ++static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs, ++ u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes) ++{ ++ u32 l4_offset, hdr_len; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ u32 l4_paylen; ++ int ret; ++ ++ if (!skb_is_gso(skb)) ++ return 0; ++ ++ ret = skb_cow_head(skb, 0); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when tso is ++ * needed. ++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ ++ /* tunnel packet */ ++ if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | ++ SKB_GSO_GRE_CSUM | ++ SKB_GSO_UDP_TUNNEL | ++ SKB_GSO_UDP_TUNNEL_CSUM)) { ++ /* reset l3&l4 pointers from outer to inner headers */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when ++ * tso is needed. 
++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ } ++ ++ /* normal or tunnel packet */ ++ l4_offset = l4.hdr - skb->data; ++ ++ /* remove payload length from inner pseudo checksum when tso */ ++ l4_paylen = skb->len - l4_offset; ++ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { ++ hdr_len = sizeof(*l4.udp) + l4_offset; ++ csum_replace_by_diff(&l4.udp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } else { ++ hdr_len = (l4.tcp->doff << 2) + l4_offset; ++ csum_replace_by_diff(&l4.tcp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } ++ ++ *send_bytes = (skb_shinfo(skb)->gso_segs - 1) * hdr_len + skb->len; ++ ++ /* find the txbd field values */ ++ *paylen_fdop_ol4cs = skb->len - hdr_len; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_TSO_B, 1); ++ ++ /* offload outer UDP header checksum */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) ++ hns3_set_field(*paylen_fdop_ol4cs, HNS3_TXD_OL4CS_B, 1); ++ ++ /* get MSS for TSO */ ++ *mss = skb_shinfo(skb)->gso_size; ++ ++ trace_hns3_tso(skb); ++ ++ return 0; ++} ++ ++static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, ++ u8 *il4_proto) ++{ ++ union l3_hdr_info l3; ++ unsigned char *l4_hdr; ++ unsigned char *exthdr; ++ u8 l4_proto_tmp; ++ __be16 frag_off; ++ ++ /* find outer header point */ ++ l3.hdr = skb_network_header(skb); ++ l4_hdr = skb_transport_header(skb); ++ ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (skb->protocol == htons(ETH_P_IP)) { ++ l4_proto_tmp = l3.v4->protocol; ++ } else { ++ return -EINVAL; ++ } ++ ++ *ol4_proto = l4_proto_tmp; ++ ++ /* tunnel packet */ ++ if (!skb->encapsulation) { ++ *il4_proto = 0; ++ return 0; ++ } ++ ++ /* find inner header point */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4_hdr = skb_inner_transport_header(skb); ++ ++ if (l3.v6->version == 6) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (l3.v4->version == 4) { ++ l4_proto_tmp = l3.v4->protocol; ++ } ++ ++ *il4_proto = l4_proto_tmp; ++ ++ return 0; ++} ++ ++/* when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL ++ * and it is udp packet, which has a dest port as the IANA assigned. ++ * the hardware is expected to do the checksum offload, but the ++ * hardware will not do the checksum offload when udp dest port is ++ * 4789, 4790 or 6081. ++ */ ++static bool hns3_tunnel_csum_bug(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ union l4_hdr_info l4; ++ ++ /* device version above V3(include V3), the hardware can ++ * do this checksum offload. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ return false; ++ ++ l4.hdr = skb_transport_header(skb); ++ ++ if (!(!skb->encapsulation && ++ (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || ++ l4.udp->dest == htons(GENEVE_UDP_PORT) || ++ l4.udp->dest == htons(IANA_VXLAN_GPE_UDP_PORT)))) ++ return false; ++ ++ return true; ++} ++ ++static void hns3_set_outer_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u32 *ol_type_vlan_len_msec) ++{ ++ u32 l2_len, l3_len, l4_len; ++ unsigned char *il2_hdr; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* compute OL2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - skb->data; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute OL3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ il2_hdr = skb_inner_mac_header(skb); ++ /* compute OL4 header size, defined in 4 Bytes */ ++ l4_len = il2_hdr - l4.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L4LEN_S, l4_len >> 2); ++ ++ /* define outer network header type */ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (skb_is_gso(skb)) ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_CSUM); ++ else ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_NO_CSUM); ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV6); ++ } ++ ++ if (ol4_proto == IPPROTO_UDP) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_MAC_IN_UDP); ++ else if (ol4_proto == IPPROTO_GRE) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_NVGRE); ++} ++ ++static void hns3_set_l3_type(struct sk_buff *skb, union l3_hdr_info l3, ++ u32 *type_cs_vlan_tso) ++{ ++ if (l3.v4->version == 4) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV4); ++ ++ /* the stack computes the IP header already, the only time we ++ * need the hardware to recompute it is in the case of TSO. ++ */ ++ if (skb_is_gso(skb)) ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3CS_B, 1); ++ } else if (l3.v6->version == 6) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV6); ++ } ++} ++ ++static int hns3_set_l4_csum_length(struct sk_buff *skb, union l4_hdr_info l4, ++ u32 l4_proto, u32 *type_cs_vlan_tso) ++{ ++ /* compute inner(/normal) L4 header size, defined in 4 Bytes */ ++ switch (l4_proto) { ++ case IPPROTO_TCP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_TCP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ l4.tcp->doff); ++ break; ++ case IPPROTO_UDP: ++ if (hns3_tunnel_csum_bug(skb)) { ++ int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN); ++ ++ return ret ? 
ret : skb_checksum_help(skb); ++ } ++ ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_UDP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct udphdr) >> 2)); ++ break; ++ case IPPROTO_SCTP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_SCTP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct sctphdr) >> 2)); ++ break; ++ default: ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ return 0; ++} ++ ++static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u8 il4_proto, u32 *type_cs_vlan_tso, ++ u32 *ol_type_vlan_len_msec) ++{ ++ unsigned char *l2_hdr = skb->data; ++ u32 l4_proto = ol4_proto; ++ union l4_hdr_info l4; ++ union l3_hdr_info l3; ++ u32 l2_len, l3_len; ++ ++ l4.hdr = skb_transport_header(skb); ++ l3.hdr = skb_network_header(skb); ++ ++ /* handle encapsulation skb */ ++ if (skb->encapsulation) { ++ /* If this is a not UDP/GRE encapsulation skb */ ++ if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); ++ ++ /* switch to inner header */ ++ l2_hdr = skb_inner_mac_header(skb); ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ l4_proto = il4_proto; ++ } ++ ++ hns3_set_l3_type(skb, l3, type_cs_vlan_tso); ++ ++ /* compute inner(/normal) L2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - l2_hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute inner(/normal) L3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ return hns3_set_l4_csum_length(skb, l4, l4_proto, type_cs_vlan_tso); ++} ++ ++static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, ++ struct sk_buff *skb) ++{ ++ struct hnae3_handle *handle = tx_ring->tqp->handle; ++ struct hnae3_ae_dev *ae_dev; ++ struct vlan_ethhdr *vhdr; ++ int rc; ++ ++ if (!(skb->protocol == htons(ETH_P_8021Q) || ++ skb_vlan_tag_present(skb))) ++ return 0; ++ ++ /* For HW limitation on HNAE3_DEVICE_VERSION_V2, if port based insert ++ * VLAN enabled, only one VLAN header is allowed in skb, otherwise it ++ * will cause RAS error. ++ */ ++ ae_dev = pci_get_drvdata(handle->pdev); ++ if (unlikely(skb_vlan_tagged_multi(skb) && ++ ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_ENABLE)) ++ return -EINVAL; ++ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ !(handle->kinfo.netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { ++ /* When HW VLAN acceleration is turned off, and the stack ++ * sets the protocol to 802.1q, the driver just need to ++ * set the protocol to the encapsulated ethertype. 
++ */
++ skb->protocol = vlan_get_protocol(skb);
++ return 0;
++ }
++
++ if (skb_vlan_tag_present(skb)) {
++ /* Based on hw strategy, use out_vtag in two layer tag case,
++ * and use inner_vtag in one tag case.
++ */
++ if (skb->protocol == htons(ETH_P_8021Q) &&
++ handle->port_base_vlan_state ==
++ HNAE3_PORT_BASE_VLAN_DISABLE)
++ rc = HNS3_OUTER_VLAN_TAG;
++ else
++ rc = HNS3_INNER_VLAN_TAG;
++
++ skb->protocol = vlan_get_protocol(skb);
++ return rc;
++ }
++
++ rc = skb_cow_head(skb, 0);
++ if (unlikely(rc < 0))
++ return rc;
++
++ vhdr = (struct vlan_ethhdr *)skb->data;
++ vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT)
++ & VLAN_PRIO_MASK);
++
++ skb->protocol = vlan_get_protocol(skb);
++ return 0;
++}
++
++/* check if the hardware is capable of checksum offloading */
++static bool hns3_check_hw_tx_csum(struct sk_buff *skb)
++{
++ struct hns3_nic_priv *priv = netdev_priv(skb->dev);
++
++ /* Kindly note, due to backward compatibility of the TX descriptor,
++ * HW checksum of the non-IP packets and GSO packets is handled at
++ * different place in the following code
++ */
++ if (skb_csum_is_sctp(skb) || skb_is_gso(skb) ||
++ !test_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state))
++ return false;
++
++ return true;
++}
++
++struct hns3_desc_param {
++ u32 paylen_ol4cs;
++ u32 ol_type_vlan_len_msec;
++ u32 type_cs_vlan_tso;
++ u16 mss_hw_csum;
++ u16 inner_vtag;
++ u16 out_vtag;
++};
++
++static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa)
++{
++ pa->paylen_ol4cs = skb->len;
++ pa->ol_type_vlan_len_msec = 0;
++ pa->type_cs_vlan_tso = 0;
++ pa->mss_hw_csum = 0;
++ pa->inner_vtag = 0;
++ pa->out_vtag = 0;
++}
++
++static int hns3_handle_vlan_info(struct hns3_enet_ring *ring,
++ struct sk_buff *skb,
++ struct hns3_desc_param *param)
++{
++ int ret;
++
++ ret = hns3_handle_vtags(ring, skb);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_vlan_err);
++ return ret;
++ } else if (ret == HNS3_INNER_VLAN_TAG) {
++ param->inner_vtag = skb_vlan_tag_get(skb);
++ param->inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) &
++ VLAN_PRIO_MASK;
++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1);
++ } else if (ret == HNS3_OUTER_VLAN_TAG) {
++ param->out_vtag = skb_vlan_tag_get(skb);
++ param->out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) &
++ VLAN_PRIO_MASK;
++ hns3_set_field(param->ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B,
++ 1);
++ }
++ return 0;
++}
++
++static int hns3_handle_csum_partial(struct hns3_enet_ring *ring,
++ struct sk_buff *skb,
++ struct hns3_desc_cb *desc_cb,
++ struct hns3_desc_param *param)
++{
++ u8 ol4_proto, il4_proto;
++ int ret;
++
++ if (hns3_check_hw_tx_csum(skb)) {
++ /* set checksum start and offset, defined in 2 Bytes */
++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_CSUM_START_S,
++ skb_checksum_start_offset(skb) >> 1);
++ hns3_set_field(param->ol_type_vlan_len_msec,
++ HNS3_TXD_CSUM_OFFSET_S,
++ skb->csum_offset >> 1);
++ param->mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B);
++ return 0;
++ }
++
++ skb_reset_mac_len(skb);
++
++ ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_l4_proto_err);
++ return ret;
++ }
++
++ ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto,
++ &param->type_cs_vlan_tso,
++ &param->ol_type_vlan_len_msec);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_l2l3l4_err);
++ return ret;
++ }
++
++ ret = hns3_set_tso(skb, &param->paylen_ol4cs, &param->mss_hw_csum,
++ &param->type_cs_vlan_tso, &desc_cb->send_bytes);
++ if (unlikely(ret < 0)) {
++ hns3_ring_stats_update(ring, tx_tso_err);
++ return ret;
++ }
++ return 0;
++}
++
++static int hns3_fill_skb_desc(struct hns3_enet_ring *ring,
++ struct sk_buff *skb, struct hns3_desc *desc,
++ struct hns3_desc_cb *desc_cb)
++{
++ struct hns3_desc_param param;
++ int ret;
++
++ hns3_init_desc_data(skb, &param);
++ ret = hns3_handle_vlan_info(ring, skb, &param);
++ if (unlikely(ret < 0))
++ return ret;
++
++ desc_cb->send_bytes = skb->len;
++
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ ret = hns3_handle_csum_partial(ring, skb, desc_cb, &param);
++ if (ret)
++ return ret;
++ }
++
++ /* Set txbd */
++ desc->tx.ol_type_vlan_len_msec =
++ cpu_to_le32(param.ol_type_vlan_len_msec);
++ desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso);
++ desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs);
++ desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum);
++ desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag);
++ desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag);
++
++ return 0;
++}
++
++static int hns3_fill_desc(struct hns3_enet_ring *ring, dma_addr_t dma,
++ unsigned int size)
++{
++#define HNS3_LIKELY_BD_NUM 1
++
++ struct hns3_desc *desc = &ring->desc[ring->next_to_use];
++ unsigned int frag_buf_num;
++ int k, sizeoflast;
++
++ if (likely(size <= HNS3_MAX_BD_SIZE)) {
++ desc->addr = cpu_to_le64(dma);
++ desc->tx.send_size = cpu_to_le16(size);
++ desc->tx.bdtp_fe_sc_vld_ra_ri =
++ cpu_to_le16(BIT(HNS3_TXD_VLD_B));
++
++ trace_hns3_tx_desc(ring, ring->next_to_use);
++ ring_ptr_move_fw(ring, next_to_use);
++ return HNS3_LIKELY_BD_NUM;
++ }
++
++ frag_buf_num = hns3_tx_bd_count(size);
++ sizeoflast = size % HNS3_MAX_BD_SIZE;
++ sizeoflast = sizeoflast ? sizeoflast : HNS3_MAX_BD_SIZE;
++
++ /* When frag size is bigger than hardware limit, split this frag */
++ for (k = 0; k < frag_buf_num; k++) {
++ /* now, fill the descriptor */
++ desc->addr = cpu_to_le64(dma + HNS3_MAX_BD_SIZE * k);
++ desc->tx.send_size = cpu_to_le16((k == frag_buf_num - 1) ?
++ (u16)sizeoflast : (u16)HNS3_MAX_BD_SIZE);
++ desc->tx.bdtp_fe_sc_vld_ra_ri =
++ cpu_to_le16(BIT(HNS3_TXD_VLD_B));
++
++ trace_hns3_tx_desc(ring, ring->next_to_use);
++ /* move ring pointer to next */
++ ring_ptr_move_fw(ring, next_to_use);
++
++ desc = &ring->desc[ring->next_to_use];
++ }
++
++ return frag_buf_num;
++}
++
++static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv,
++ unsigned int type)
++{
++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use];
++ struct device *dev = ring_to_dev(ring);
++ unsigned int size;
++ dma_addr_t dma;
++
++ if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) {
++ struct sk_buff *skb = (struct sk_buff *)priv;
++
++ size = skb_headlen(skb);
++ if (!size)
++ return 0;
++
++ dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE);
++ } else if (type & DESC_TYPE_BOUNCE_HEAD) {
++ /* Head data has been filled in hns3_handle_tx_bounce(),
++ * just return 0 here.
++ */ ++ return 0; ++ } else { ++ skb_frag_t *frag = (skb_frag_t *)priv; ++ ++ size = skb_frag_size(frag); ++ if (!size) ++ return 0; ++ ++ dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); ++ } ++ ++ if (unlikely(dma_mapping_error(dev, dma))) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = priv; ++ desc_cb->length = size; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ return hns3_fill_desc(ring, dma, size); ++} ++ ++static unsigned int hns3_skb_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num) ++{ ++ unsigned int size; ++ int i; ++ ++ size = skb_headlen(skb); ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ if (size) { ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ size = skb_frag_size(frag); ++ if (!size) ++ continue; ++ ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_tx_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ u8 max_non_tso_bd_num, unsigned int bd_num, ++ unsigned int recursion_level) ++{ ++#define HNS3_MAX_RECURSION_LEVEL 24 ++ ++ struct sk_buff *frag_skb; ++ ++ /* If the total len is within the max bd limit */ ++ if (likely(skb->len <= HNS3_MAX_BD_SIZE && !recursion_level && ++ !skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < max_non_tso_bd_num)) ++ return skb_shinfo(skb)->nr_frags + 1U; ++ ++ if (unlikely(recursion_level >= HNS3_MAX_RECURSION_LEVEL)) ++ return UINT_MAX; ++ ++ bd_num = hns3_skb_bd_num(skb, bd_size, bd_num); ++ if (!skb_has_frag_list(skb) || bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ ++ skb_walk_frags(skb, frag_skb) { ++ bd_num = hns3_tx_bd_num(frag_skb, bd_size, max_non_tso_bd_num, ++ bd_num, recursion_level + 1); ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_gso_hdr_len(struct sk_buff *skb) ++{ ++ if (!skb->encapsulation) ++ return skb_tcp_all_headers(skb); ++ ++ return skb_inner_tcp_all_headers(skb); ++} ++ ++/* HW need every continuous max_non_tso_bd_num buffer data to be larger ++ * than MSS, we simplify it by ensuring skb_headlen + the first continuous ++ * max_non_tso_bd_num - 1 frags to be larger than gso header len + mss, ++ * and the remaining continuous max_non_tso_bd_num - 1 frags to be larger ++ * than MSS except the last max_non_tso_bd_num - 1 frags. ++ */ ++static bool hns3_skb_need_linearized(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num, u8 max_non_tso_bd_num) ++{ ++ unsigned int tot_len = 0; ++ int i; ++ ++ for (i = 0; i < max_non_tso_bd_num - 1U; i++) ++ tot_len += bd_size[i]; ++ ++ /* ensure the first max_non_tso_bd_num frags is greater than ++ * mss + header ++ */ ++ if (tot_len + bd_size[max_non_tso_bd_num - 1U] < ++ skb_shinfo(skb)->gso_size + hns3_gso_hdr_len(skb)) ++ return true; ++ ++ /* ensure every continuous max_non_tso_bd_num - 1 buffer is greater ++ * than mss except the last one. 
++ */ ++ for (i = 0; i < bd_num - max_non_tso_bd_num; i++) { ++ tot_len -= bd_size[i]; ++ tot_len += bd_size[i + max_non_tso_bd_num - 1U]; ++ ++ if (tot_len < skb_shinfo(skb)->gso_size) ++ return true; ++ } ++ ++ return false; ++} ++ ++void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_SKB_FRAGS; i++) ++ size[i] = skb_frag_size(&shinfo->frags[i]); ++} ++ ++static int hns3_skb_linearize(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ unsigned int bd_num) ++{ ++ /* 'bd_num == UINT_MAX' means the skb' fraglist has a ++ * recursion level of over HNS3_MAX_RECURSION_LEVEL. ++ */ ++ if (bd_num == UINT_MAX) { ++ hns3_ring_stats_update(ring, over_max_recursion); ++ return -ENOMEM; ++ } ++ ++ /* The skb->len has exceeded the hw limitation, linearization ++ * will not help. ++ */ ++ if (skb->len > HNS3_MAX_TSO_SIZE || ++ (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { ++ hns3_ring_stats_update(ring, hw_limitation); ++ return -ENOMEM; ++ } ++ ++ if (__skb_linearize(skb)) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, ++ struct net_device *netdev, ++ struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u8 max_non_tso_bd_num = priv->max_non_tso_bd_num; ++ unsigned int bd_size[HNS3_MAX_TSO_BD_NUM + 1U]; ++ unsigned int bd_num; ++ ++ bd_num = hns3_tx_bd_num(skb, bd_size, max_non_tso_bd_num, 0, 0); ++ if (unlikely(bd_num > max_non_tso_bd_num)) { ++ if (bd_num <= HNS3_MAX_TSO_BD_NUM && skb_is_gso(skb) && ++ !hns3_skb_need_linearized(skb, bd_size, bd_num, ++ max_non_tso_bd_num)) { ++ trace_hns3_over_max_bd(skb); ++ goto out; ++ } ++ ++ if (hns3_skb_linearize(ring, skb, bd_num)) ++ return -ENOMEM; ++ ++ bd_num = hns3_tx_bd_count(skb->len); ++ ++ hns3_ring_stats_update(ring, tx_copy); ++ } ++ ++out: ++ if (likely(ring_space(ring) >= bd_num)) ++ return bd_num; ++ ++ netif_stop_subqueue(netdev, ring->queue_index); ++ smp_mb(); /* Memory barrier before checking ring_space */ ++ ++ /* Start queue in case hns3_clean_tx_ring has just made room ++ * available and has not seen the queue stopped state performed ++ * by netif_stop_subqueue above. 
++ */ ++ if (ring_space(ring) >= bd_num && netif_carrier_ok(netdev) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_start_subqueue(netdev, ring->queue_index); ++ return bd_num; ++ } ++ ++ hns3_ring_stats_update(ring, tx_busy); ++ ++ return -EBUSY; ++} ++ ++static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) ++{ ++ struct device *dev = ring_to_dev(ring); ++ unsigned int i; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ struct hns3_desc_cb *desc_cb; ++ ++ memset(desc, 0, sizeof(*desc)); ++ ++ /* check if this is where we started */ ++ if (ring->next_to_use == next_to_use_orig) ++ break; ++ ++ /* rollback one */ ++ ring_ptr_move_bw(ring, next_to_use); ++ ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ ++ if (!desc_cb->dma) ++ continue; ++ ++ /* unmap the descriptor dma address */ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ else if (desc_cb->type & ++ (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) ++ hns3_tx_spare_rollback(ring, desc_cb->length); ++ else if (desc_cb->length) ++ dma_unmap_page(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ ++ desc_cb->length = 0; ++ desc_cb->dma = 0; ++ desc_cb->type = DESC_TYPE_UNKNOWN; ++ } ++} ++ ++static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, unsigned int type) ++{ ++ struct sk_buff *frag_skb; ++ int i, ret, bd_num = 0; ++ ++ ret = hns3_map_and_fill_desc(ring, skb, type); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ ret = hns3_map_and_fill_desc(ring, frag, DESC_TYPE_PAGE); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ skb_walk_frags(skb, frag_skb) { ++ ret = hns3_fill_skb_to_desc(ring, frag_skb, ++ DESC_TYPE_FRAGLIST_SKB); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ return bd_num; ++} ++ ++static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) ++{ ++#define HNS3_BYTES_PER_64BIT 8 ++ ++ struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {}; ++ int offset = 0; ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ do { ++ int idx = (ring->next_to_use - num + ring->desc_num) % ++ ring->desc_num; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_push++; ++ u64_stats_update_end(&ring->syncp); ++ memcpy(&desc[offset], &ring->desc[idx], ++ sizeof(struct hns3_desc)); ++ offset++; ++ } while (--num); ++ ++ __iowrite64_copy(ring->tqp->mem_base, desc, ++ (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / ++ HNS3_BYTES_PER_64BIT); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) ++{ ++#define HNS3_MEM_DOORBELL_OFFSET 64 ++ ++ __le64 bd_num = cpu_to_le64((u64)ring->pending_buf); ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET, ++ &bd_num, 1); ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_mem_doorbell += ring->pending_buf; ++ u64_stats_update_end(&ring->syncp); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, ++ bool doorbell) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv 
*priv = netdev_priv(netdev); ++ ++ /* when tx push is enabled, the packet whose number of BD below ++ * HNS3_MAX_PUSH_BD_NUM can be pushed directly. ++ */ ++ if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num && ++ !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) { ++ hns3_tx_push_bd(ring, num); ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++ return; ++ } ++ ++ ring->pending_buf += num; ++ ++ if (!doorbell) { ++ hns3_ring_stats_update(ring, tx_more); ++ return; ++ } ++ ++ if (ring->tqp->mem_base) ++ hns3_tx_mem_doorbell(ring); ++ else ++ writel(ring->pending_buf, ++ ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG); ++ ++ ring->pending_buf = 0; ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++} ++ ++static void hns3_tsyn(struct net_device *netdev, struct sk_buff *skb, ++ struct hns3_desc *desc) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!(h->ae_algo->ops->set_tx_hwts_info && ++ h->ae_algo->ops->set_tx_hwts_info(h, skb))) ++ return; ++ ++ desc->tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_TSYN_B)); ++} ++ ++static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ unsigned int type = DESC_TYPE_BOUNCE_HEAD; ++ unsigned int size = skb_headlen(skb); ++ dma_addr_t dma; ++ int bd_num = 0; ++ u32 cb_len; ++ void *buf; ++ int ret; ++ ++ if (skb->len <= ring->tx_copybreak) { ++ size = skb->len; ++ type = DESC_TYPE_BOUNCE_ALL; ++ } ++ ++ /* hns3_can_use_tx_bounce() is called to ensure the below ++ * function can always return the tx buffer. ++ */ ++ buf = hns3_tx_spare_alloc(ring, size, &dma, &cb_len); ++ ++ ret = skb_copy_bits(skb, 0, buf, size); ++ if (unlikely(ret < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, copy_bits_err); ++ return ret; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ bd_num += hns3_fill_desc(ring, dma, size); ++ ++ if (type == DESC_TYPE_BOUNCE_HEAD) { ++ ret = hns3_fill_skb_to_desc(ring, skb, ++ DESC_TYPE_BOUNCE_HEAD); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ dma_sync_single_for_device(ring_to_dev(ring), dma, size, ++ DMA_TO_DEVICE); ++ ++ hns3_ring_stats_update(ring, tx_bounce); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ u32 nfrag = skb_shinfo(skb)->nr_frags + 1; ++ struct sg_table *sgt; ++ int i, bd_num = 0; ++ dma_addr_t dma; ++ u32 cb_len; ++ int nents; ++ ++ if (skb_has_frag_list(skb)) ++ nfrag = HNS3_MAX_TSO_BD_NUM; ++ ++ /* hns3_can_use_tx_sgl() is called to ensure the below ++ * function can always return the tx buffer. 
++ */ ++ sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag), ++ &dma, &cb_len); ++ ++ /* scatterlist follows by the sg table */ ++ sgt->sgl = (struct scatterlist *)(sgt + 1); ++ sg_init_table(sgt->sgl, nfrag); ++ nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); ++ if (unlikely(nents < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, skb2sgl_err); ++ return -ENOMEM; ++ } ++ ++ sgt->orig_nents = nents; ++ sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ if (unlikely(!sgt->nents)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, map_sg_err); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = DESC_TYPE_SGL_SKB; ++ ++ for (i = 0; i < sgt->nents; i++) ++ bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), ++ sg_dma_len(sgt->sgl + i)); ++ hns3_ring_stats_update(ring, tx_sgl); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ u32 space; ++ ++ if (!ring->tx_spare) ++ goto out; ++ ++ space = hns3_tx_spare_space(ring); ++ ++ if (hns3_can_use_tx_sgl(ring, skb, space)) ++ return hns3_handle_tx_sgl(ring, skb); ++ ++ if (hns3_can_use_tx_bounce(ring, skb, space)) ++ return hns3_handle_tx_bounce(ring, skb); ++ ++out: ++ return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); ++} ++ ++static int hns3_handle_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ int next_to_use_head) ++{ ++ int ret; ++ ++ ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], ++ desc_cb); ++ if (unlikely(ret < 0)) ++ goto fill_err; ++ ++ /* 'ret < 0' means filling error, 'ret == 0' means skb->len is ++ * zero, which is unlikely, and 'ret > 0' means how many tx desc ++ * need to be notified to the hw. ++ */ ++ ret = hns3_handle_desc_filling(ring, skb); ++ if (likely(ret > 0)) ++ return ret; ++ ++fill_err: ++ hns3_clear_desc(ring, next_to_use_head); ++ return ret; ++} ++ ++netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_ring *ring = &priv->ring[skb->queue_mapping]; ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct netdev_queue *dev_queue; ++ int pre_ntu, ret; ++ bool doorbell; ++ ++ /* Hardware can only handle short frames above 32 bytes */ ++ if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ /* Prefetch the data used later */ ++ prefetch(skb->data); ++ ++ ret = hns3_nic_maybe_stop_tx(ring, netdev, skb); ++ if (unlikely(ret <= 0)) { ++ if (ret == -EBUSY) { ++ hns3_tx_doorbell(ring, 0, true); ++ return NETDEV_TX_BUSY; ++ } ++ ++ hns3_rl_err(netdev, "xmit error: %d!\n", ret); ++ goto out_err_tx_ok; ++ } ++ ++ ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use); ++ if (unlikely(ret <= 0)) ++ goto out_err_tx_ok; ++ ++ pre_ntu = ring->next_to_use ? 
(ring->next_to_use - 1) : ++ (ring->desc_num - 1); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) ++ hns3_tsyn(netdev, skb, &ring->desc[pre_ntu]); ++ ++ ring->desc[pre_ntu].tx.bdtp_fe_sc_vld_ra_ri |= ++ cpu_to_le16(BIT(HNS3_TXD_FE_B)); ++ trace_hns3_tx_desc(ring, pre_ntu); ++ ++ skb_tx_timestamp(skb); ++ ++ /* Complete translate all packets */ ++ dev_queue = netdev_get_tx_queue(netdev, ring->queue_index); ++ doorbell = __netdev_tx_sent_queue(dev_queue, desc_cb->send_bytes, ++ netdev_xmit_more()); ++ hns3_tx_doorbell(ring, ret, doorbell); ++ ++ return NETDEV_TX_OK; ++ ++out_err_tx_ok: ++ dev_kfree_skb_any(skb); ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ return NETDEV_TX_OK; ++} ++ ++static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) ++{ ++ char format_mac_addr_perm[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ char format_mac_addr_sa[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct sockaddr *mac_addr = p; ++ int ret; ++ ++ if (!mac_addr || !is_valid_ether_addr((const u8 *)mac_addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) { ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_info(netdev, "already using mac address %s\n", ++ format_mac_addr_sa); ++ return 0; ++ } ++ ++ /* For VF device, if there is a perm_addr, then the user will not ++ * be allowed to change the address. ++ */ ++ if (!hns3_is_phys_func(h->pdev) && ++ !is_zero_ether_addr(netdev->perm_addr)) { ++ hnae3_format_mac_addr(format_mac_addr_perm, netdev->perm_addr); ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_err(netdev, "has permanent MAC %s, user MAC %s not allow\n", ++ format_mac_addr_perm, format_mac_addr_sa); ++ return -EPERM; ++ } ++ ++ ret = h->ae_algo->ops->set_mac_addr(h, mac_addr->sa_data, false); ++ if (ret) { ++ netdev_err(netdev, "set_mac_address fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ eth_hw_addr_set(netdev, mac_addr->sa_data); ++ ++ return 0; ++} ++ ++static int hns3_nic_do_ioctl(struct net_device *netdev, ++ struct ifreq *ifr, int cmd) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!netif_running(netdev)) ++ return -EINVAL; ++ ++ if (!h->ae_algo->ops->do_ioctl) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->do_ioctl(h, ifr, cmd); ++} ++ ++static int hns3_nic_set_features(struct net_device *netdev, ++ netdev_features_t features) ++{ ++ netdev_features_t changed = netdev->features ^ features; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ bool enable; ++ int ret; ++ ++ if (changed & (NETIF_F_GRO_HW) && h->ae_algo->ops->set_gro_en) { ++ enable = !!(features & NETIF_F_GRO_HW); ++ ret = h->ae_algo->ops->set_gro_en(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_RX) && ++ h->ae_algo->ops->enable_hw_strip_rxvtag) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_RX); ++ ret = h->ae_algo->ops->enable_hw_strip_rxvtag(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_NTUPLE) && h->ae_algo->ops->enable_fd) { ++ enable = !!(features & NETIF_F_NTUPLE); ++ h->ae_algo->ops->enable_fd(h, enable); ++ } ++ ++ if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) && ++ h->ae_algo->ops->cls_flower_active(h)) { ++ netdev_err(netdev, ++ "there are offloaded TC filters active, cannot disable HW TC offload"); ++ return -EINVAL; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_FILTER) && ++ 
h->ae_algo->ops->enable_vlan_filter) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); ++ ret = h->ae_algo->ops->enable_vlan_filter(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ netdev->features = features; ++ return 0; ++} ++ ++static netdev_features_t hns3_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++#define HNS3_MAX_HDR_LEN 480U ++#define HNS3_MAX_L4_HDR_LEN 60U ++ ++ size_t len; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ return features; ++ ++ if (skb->encapsulation) ++ len = skb_inner_transport_header(skb) - skb->data; ++ else ++ len = skb_transport_header(skb) - skb->data; ++ ++ /* Assume L4 is 60 byte as TCP is the only protocol with a ++ * a flexible value, and it's max len is 60 bytes. ++ */ ++ len += HNS3_MAX_L4_HDR_LEN; ++ ++ /* Hardware only supports checksum on the skb with a max header ++ * len of 480 bytes. ++ */ ++ if (len > HNS3_MAX_HDR_LEN) ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ ++ return features; ++} ++ ++static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, ++ struct hns3_enet_ring *ring, bool is_tx) ++{ ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&ring->syncp); ++ if (is_tx) { ++ stats->tx_bytes += ring->stats.tx_bytes; ++ stats->tx_packets += ring->stats.tx_pkts; ++ stats->tx_dropped += ring->stats.sw_err_cnt; ++ stats->tx_dropped += ring->stats.tx_vlan_err; ++ stats->tx_dropped += ring->stats.tx_l4_proto_err; ++ stats->tx_dropped += ring->stats.tx_l2l3l4_err; ++ stats->tx_dropped += ring->stats.tx_tso_err; ++ stats->tx_dropped += ring->stats.over_max_recursion; ++ stats->tx_dropped += ring->stats.hw_limitation; ++ stats->tx_dropped += ring->stats.copy_bits_err; ++ stats->tx_dropped += ring->stats.skb2sgl_err; ++ stats->tx_dropped += ring->stats.map_sg_err; ++ stats->tx_errors += ring->stats.sw_err_cnt; ++ stats->tx_errors += ring->stats.tx_vlan_err; ++ stats->tx_errors += ring->stats.tx_l4_proto_err; ++ stats->tx_errors += ring->stats.tx_l2l3l4_err; ++ stats->tx_errors += ring->stats.tx_tso_err; ++ stats->tx_errors += ring->stats.over_max_recursion; ++ stats->tx_errors += ring->stats.hw_limitation; ++ stats->tx_errors += ring->stats.copy_bits_err; ++ stats->tx_errors += ring->stats.skb2sgl_err; ++ stats->tx_errors += ring->stats.map_sg_err; ++ } else { ++ stats->rx_bytes += ring->stats.rx_bytes; ++ stats->rx_packets += ring->stats.rx_pkts; ++ stats->rx_dropped += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l3l4_csum_err; ++ stats->rx_crc_errors += ring->stats.l2_err; ++ stats->multicast += ring->stats.rx_multicast; ++ stats->rx_length_errors += ring->stats.err_pkt_len; ++ } ++ } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); ++} ++ ++static void hns3_nic_get_stats64(struct net_device *netdev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hnae3_handle *handle = priv->ae_handle; ++ struct rtnl_link_stats64 ring_total_stats; ++ struct hns3_enet_ring *ring; ++ unsigned int idx; ++ ++ if (test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ ++ handle->ae_algo->ops->update_stats(handle, &netdev->stats); ++ ++ memset(&ring_total_stats, 0, sizeof(ring_total_stats)); ++ for (idx = 0; idx < queue_num; idx++) { ++ /* fetch the tx stats */ ++ ring = &priv->ring[idx]; ++ hns3_fetch_stats(&ring_total_stats, ring, true); ++ ++ /* fetch the rx stats */ ++ ring = &priv->ring[idx + 
queue_num]; ++ hns3_fetch_stats(&ring_total_stats, ring, false); ++ } ++ ++ stats->tx_bytes = ring_total_stats.tx_bytes; ++ stats->tx_packets = ring_total_stats.tx_packets; ++ stats->rx_bytes = ring_total_stats.rx_bytes; ++ stats->rx_packets = ring_total_stats.rx_packets; ++ ++ stats->rx_errors = ring_total_stats.rx_errors; ++ stats->multicast = ring_total_stats.multicast; ++ stats->rx_length_errors = ring_total_stats.rx_length_errors; ++ stats->rx_crc_errors = ring_total_stats.rx_crc_errors; ++ stats->rx_missed_errors = netdev->stats.rx_missed_errors; ++ ++ stats->tx_errors = ring_total_stats.tx_errors; ++ stats->rx_dropped = ring_total_stats.rx_dropped; ++ stats->tx_dropped = ring_total_stats.tx_dropped; ++ stats->collisions = netdev->stats.collisions; ++ stats->rx_over_errors = netdev->stats.rx_over_errors; ++ stats->rx_frame_errors = netdev->stats.rx_frame_errors; ++ stats->rx_fifo_errors = netdev->stats.rx_fifo_errors; ++ stats->tx_aborted_errors = netdev->stats.tx_aborted_errors; ++ stats->tx_carrier_errors = netdev->stats.tx_carrier_errors; ++ stats->tx_fifo_errors = netdev->stats.tx_fifo_errors; ++ stats->tx_heartbeat_errors = netdev->stats.tx_heartbeat_errors; ++ stats->tx_window_errors = netdev->stats.tx_window_errors; ++ stats->rx_compressed = netdev->stats.rx_compressed; ++ stats->tx_compressed = netdev->stats.tx_compressed; ++} ++ ++static int hns3_setup_tc(struct net_device *netdev, void *type_data) ++{ ++ struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; ++ struct hnae3_knic_private_info *kinfo; ++ u8 tc = mqprio_qopt->qopt.num_tc; ++ u16 mode = mqprio_qopt->mode; ++ u8 hw = mqprio_qopt->qopt.hw; ++ struct hnae3_handle *h; ++ ++ if (!((hw == TC_MQPRIO_HW_OFFLOAD_TCS && ++ mode == TC_MQPRIO_MODE_CHANNEL) || (!hw && tc == 0))) ++ return -EOPNOTSUPP; ++ ++ if (tc > HNAE3_MAX_TC) ++ return -EINVAL; ++ ++ if (!netdev) ++ return -EINVAL; ++ ++ h = hns3_get_handle(netdev); ++ kinfo = &h->kinfo; ++ ++ netif_dbg(h, drv, netdev, "setup tc: num_tc=%u\n", tc); ++ ++ return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ? 
++ kinfo->dcb_ops->setup_tc(h, mqprio_qopt) : -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_cls_flower(struct hns3_nic_priv *priv, ++ struct flow_cls_offload *flow) ++{ ++ int tc = tc_classid_to_hwtc(priv->netdev, flow->classid); ++ struct hnae3_handle *h = hns3_get_handle(priv->netdev); ++ ++ switch (flow->command) { ++ case FLOW_CLS_REPLACE: ++ if (h->ae_algo->ops->add_cls_flower) ++ return h->ae_algo->ops->add_cls_flower(h, flow, tc); ++ break; ++ case FLOW_CLS_DESTROY: ++ if (h->ae_algo->ops->del_cls_flower) ++ return h->ae_algo->ops->del_cls_flower(h, flow); ++ break; ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_block_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct hns3_nic_priv *priv = cb_priv; ++ ++ if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return hns3_setup_tc_cls_flower(priv, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static LIST_HEAD(hns3_block_cb_list); ++ ++static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(dev); ++ int ret; ++ ++ switch (type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ ret = hns3_setup_tc(dev, type_data); ++ break; ++ case TC_SETUP_BLOCK: ++ ret = flow_block_cb_setup_simple(type_data, ++ &hns3_block_cb_list, ++ hns3_setup_tc_block_cb, ++ priv, priv, true); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_add_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, false); ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_kill_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, true); ++ ++ return ret; ++} ++ ++static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, ++ u8 qos, __be16 vlan_proto) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ netif_dbg(h, drv, netdev, ++ "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=0x%x\n", ++ vf, vlan, qos, ntohs(vlan_proto)); ++ ++ if (h->ae_algo->ops->set_vf_vlan_filter) ++ ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan, ++ qos, vlan_proto); ++ ++ return ret; ++} ++ ++static int hns3_set_vf_spoofchk(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!handle->ae_algo->ops->set_vf_spoofchk) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_spoofchk(handle, vf, enable); ++} ++ ++static int hns3_set_vf_trust(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (!handle->ae_algo->ops->set_vf_trust) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_trust(handle, vf, enable); ++} ++ ++static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!h->ae_algo->ops->set_mtu) ++ return -EOPNOTSUPP; ++ ++ netif_dbg(h, drv, netdev, ++ "change 
mtu from %u to %d\n", netdev->mtu, new_mtu); ++ ++ ret = h->ae_algo->ops->set_mtu(h, new_mtu); ++ if (ret) ++ netdev_err(netdev, "failed to change MTU in hardware %d\n", ++ ret); ++ else ++ netdev->mtu = new_mtu; ++ ++ return ret; ++} ++ ++static int hns3_get_timeout_queue(struct net_device *ndev) ++{ ++ int i; ++ ++ /* Find the stopped queue the same way the stack does */ ++ for (i = 0; i < ndev->num_tx_queues; i++) { ++ struct netdev_queue *q; ++ unsigned long trans_start; ++ ++ q = netdev_get_tx_queue(ndev, i); ++ trans_start = READ_ONCE(q->trans_start); ++ if (netif_xmit_stopped(q) && ++ time_after(jiffies, ++ (trans_start + ndev->watchdog_timeo))) { ++#ifdef CONFIG_BQL ++ struct dql *dql = &q->dql; ++ ++ netdev_info(ndev, "DQL info last_cnt: %u, queued: %u, adj_limit: %u, completed: %u\n", ++ dql->last_obj_cnt, dql->num_queued, ++ dql->adj_limit, dql->num_completed); ++#endif ++ netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", ++ q->state, ++ jiffies_to_msecs(jiffies - trans_start)); ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void hns3_dump_queue_stats(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring, ++ int timeout_queue) ++{ ++ struct napi_struct *napi = &tx_ring->tqp_vector->napi; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ netdev_info(ndev, ++ "tx_timeout count: %llu, queue id: %d, SW_NTU: 0x%x, SW_NTC: 0x%x, napi state: %lu\n", ++ priv->tx_timeout_count, timeout_queue, tx_ring->next_to_use, ++ tx_ring->next_to_clean, napi->state); ++ ++ netdev_info(ndev, ++ "tx_pkts: %llu, tx_bytes: %llu, sw_err_cnt: %llu, tx_pending: %d\n", ++ tx_ring->stats.tx_pkts, tx_ring->stats.tx_bytes, ++ tx_ring->stats.sw_err_cnt, tx_ring->pending_buf); ++ ++ netdev_info(ndev, ++ "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n", ++ tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more, ++ tx_ring->stats.restart_queue, tx_ring->stats.tx_busy); ++ ++ netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n", ++ tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell); ++} ++ ++static void hns3_dump_queue_reg(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring) ++{ ++ netdev_info(ndev, ++ "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_NUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_HEAD_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TAIL_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_ERR_REG), ++ readl(tx_ring->tqp_vector->mask_addr)); ++ netdev_info(ndev, ++ "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_EN_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TC_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_FBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_OFFSET_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_EBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, ++ HNS3_RING_TX_RING_EBD_OFFSET_REG)); ++} ++ ++static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ struct hns3_enet_ring *tx_ring; ++ int timeout_queue; ++ ++ timeout_queue = hns3_get_timeout_queue(ndev); ++ if (timeout_queue >= ndev->num_tx_queues) { ++ netdev_info(ndev, ++ "no netdev TX timeout queue found, timeout count: %llu\n", ++ priv->tx_timeout_count); ++ return false; ++ } ++ ++ priv->tx_timeout_count++; ++ ++ tx_ring = &priv->ring[timeout_queue]; ++ 
hns3_dump_queue_stats(ndev, tx_ring, timeout_queue); ++ ++ /* When mac received many pause frames continuous, it's unable to send ++ * packets, which may cause tx timeout ++ */ ++ if (h->ae_algo->ops->get_mac_stats) { ++ struct hns3_mac_stats mac_stats; ++ ++ h->ae_algo->ops->get_mac_stats(h, &mac_stats); ++ netdev_info(ndev, "tx_pause_cnt: %llu, rx_pause_cnt: %llu\n", ++ mac_stats.tx_pause_cnt, mac_stats.rx_pause_cnt); ++ } ++ ++ hns3_dump_queue_reg(ndev, tx_ring); ++ ++ return true; ++} ++ ++static void hns3_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ if (!hns3_get_tx_timeo_queue_info(ndev)) ++ return; ++ ++ /* request the reset, and let the hclge to determine ++ * which reset level should be done ++ */ ++ if (h->ae_algo->ops->reset_event) ++ h->ae_algo->ops->reset_event(h->pdev, h); ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++static int hns3_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, ++ u16 rxq_index, u32 flow_id) ++{ ++ struct hnae3_handle *h = hns3_get_handle(dev); ++ struct flow_keys fkeys; ++ ++ if (!h->ae_algo->ops->add_arfs_entry) ++ return -EOPNOTSUPP; ++ ++ if (skb->encapsulation) ++ return -EPROTONOSUPPORT; ++ ++ if (!skb_flow_dissect_flow_keys(skb, &fkeys, 0)) ++ return -EPROTONOSUPPORT; ++ ++ if ((fkeys.basic.n_proto != htons(ETH_P_IP) && ++ fkeys.basic.n_proto != htons(ETH_P_IPV6)) || ++ (fkeys.basic.ip_proto != IPPROTO_TCP && ++ fkeys.basic.ip_proto != IPPROTO_UDP)) ++ return -EPROTONOSUPPORT; ++ ++ return h->ae_algo->ops->add_arfs_entry(h, rxq_index, flow_id, &fkeys); ++} ++#endif ++ ++static int hns3_nic_get_vf_config(struct net_device *ndev, int vf, ++ struct ifla_vf_info *ivf) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->get_vf_config) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->get_vf_config(h, vf, ivf); ++} ++ ++static int hns3_nic_set_vf_link_state(struct net_device *ndev, int vf, ++ int link_state) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_link_state) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_link_state(h, vf, link_state); ++} ++ ++static int hns3_nic_set_vf_rate(struct net_device *ndev, int vf, ++ int min_tx_rate, int max_tx_rate) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_rate) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_rate(h, vf, min_tx_rate, max_tx_rate, ++ false); ++} ++ ++static int hns3_nic_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ if (!h->ae_algo->ops->set_vf_mac) ++ return -EOPNOTSUPP; ++ ++ if (is_multicast_ether_addr(mac)) { ++ hnae3_format_mac_addr(format_mac_addr, mac); ++ netdev_err(netdev, ++ "Invalid MAC:%s specified. 
Could not set MAC\n", ++ format_mac_addr); ++ return -EINVAL; ++ } ++ ++ return h->ae_algo->ops->set_vf_mac(h, vf_id, mac); ++} ++ ++static const struct net_device_ops hns3_nic_netdev_ops = { ++ .ndo_open = hns3_nic_net_open, ++ .ndo_stop = hns3_nic_net_stop, ++ .ndo_start_xmit = hns3_nic_net_xmit, ++ .ndo_tx_timeout = hns3_nic_net_timeout, ++ .ndo_set_mac_address = hns3_nic_net_set_mac_address, ++ .ndo_eth_ioctl = hns3_nic_do_ioctl, ++ .ndo_change_mtu = hns3_nic_change_mtu, ++ .ndo_set_features = hns3_nic_set_features, ++ .ndo_features_check = hns3_features_check, ++ .ndo_get_stats64 = hns3_nic_get_stats64, ++ .ndo_setup_tc = hns3_nic_setup_tc, ++ .ndo_set_rx_mode = hns3_nic_set_rx_mode, ++ .ndo_vlan_rx_add_vid = hns3_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = hns3_vlan_rx_kill_vid, ++ .ndo_set_vf_vlan = hns3_ndo_set_vf_vlan, ++ .ndo_set_vf_spoofchk = hns3_set_vf_spoofchk, ++ .ndo_set_vf_trust = hns3_set_vf_trust, ++#ifdef CONFIG_RFS_ACCEL ++ .ndo_rx_flow_steer = hns3_rx_flow_steer, ++#endif ++ .ndo_get_vf_config = hns3_nic_get_vf_config, ++ .ndo_set_vf_link_state = hns3_nic_set_vf_link_state, ++ .ndo_set_vf_rate = hns3_nic_set_vf_rate, ++ .ndo_set_vf_mac = hns3_nic_set_vf_mac, ++}; ++ ++bool hns3_is_phys_func(struct pci_dev *pdev) ++{ ++ u32 dev_id = pdev->device; ++ ++ switch (dev_id) { ++ case HNAE3_DEV_ID_GE: ++ case HNAE3_DEV_ID_25GE: ++ case HNAE3_DEV_ID_25GE_RDMA: ++ case HNAE3_DEV_ID_25GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_50GE_RDMA: ++ case HNAE3_DEV_ID_50GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_100G_RDMA_MACSEC: ++ case HNAE3_DEV_ID_200G_RDMA: ++ return true; ++ case HNAE3_DEV_ID_VF: ++ case HNAE3_DEV_ID_RDMA_DCB_PFC_VF: ++ return false; ++ default: ++ dev_warn(&pdev->dev, "un-recognized pci device-id %u", ++ dev_id); ++ } ++ ++ return false; ++} ++ ++static void hns3_disable_sriov(struct pci_dev *pdev) ++{ ++ /* If our VFs are assigned we cannot shut down SR-IOV ++ * without causing issues, so just leave the hardware ++ * available but disabled ++ */ ++ if (pci_vfs_assigned(pdev)) { ++ dev_warn(&pdev->dev, ++ "disabling driver while VFs are assigned\n"); ++ return; ++ } ++ ++ pci_disable_sriov(pdev); ++} ++ ++/* hns3_probe - Device initialization routine ++ * @pdev: PCI device information struct ++ * @ent: entry in hns3_pci_tbl ++ * ++ * hns3_probe initializes a PF identified by a pci_dev structure. ++ * The OS initialization, configuring of the PF private structure, ++ * and a hardware reset occur. 
++ * ++ * Returns 0 on success, negative on failure ++ */ ++static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ++{ ++ struct hnae3_ae_dev *ae_dev; ++ int ret; ++ ++ ae_dev = devm_kzalloc(&pdev->dev, sizeof(*ae_dev), GFP_KERNEL); ++ if (!ae_dev) ++ return -ENOMEM; ++ ++ ae_dev->pdev = pdev; ++ ae_dev->flag = ent->driver_data; ++ pci_set_drvdata(pdev, ae_dev); ++ ++ ret = hnae3_register_ae_dev(ae_dev); ++ if (ret) ++ pci_set_drvdata(pdev, NULL); ++ ++ return ret; ++} ++ ++/** ++ * hns3_clean_vf_config ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs allocated ++ * ++ * Clean residual vf config after disable sriov ++ **/ ++static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (ae_dev->ops->clean_vf_config) ++ ae_dev->ops->clean_vf_config(ae_dev, num_vfs); ++} ++ ++/* hns3_remove - Device removal routine ++ * @pdev: PCI device information struct ++ */ ++static void hns3_remove(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV)) ++ hns3_disable_sriov(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++} ++ ++/** ++ * hns3_pci_sriov_configure ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs to allocate ++ * ++ * Enable or change the number of VFs. Called when the user updates the number ++ * of VFs in sysfs. ++ **/ ++static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) ++{ ++ int ret; ++ ++ if (!(hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))) { ++ dev_warn(&pdev->dev, "Can not config SRIOV\n"); ++ return -EINVAL; ++ } ++ ++ if (num_vfs) { ++ ret = pci_enable_sriov(pdev, num_vfs); ++ if (ret) ++ dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret); ++ else ++ return num_vfs; ++ } else if (!pci_vfs_assigned(pdev)) { ++ int num_vfs_pre = pci_num_vf(pdev); ++ ++ pci_disable_sriov(pdev); ++ hns3_clean_vf_config(pdev, num_vfs_pre); ++ } else { ++ dev_warn(&pdev->dev, ++ "Unable to free VFs because some are assigned to VMs.\n"); ++ } ++ ++ return 0; ++} ++ ++static void hns3_shutdown(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++ ++ if (system_state == SYSTEM_POWER_OFF) ++ pci_set_power_state(pdev, PCI_D3hot); ++} ++ ++static int __maybe_unused hns3_suspend(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to suspend.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FUNC_RESET); ++ } ++ ++ return 0; ++} ++ ++static int __maybe_unused hns3_resume(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to resume.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++ } ++ ++ return 0; ++} ++ ++static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev, ++ pci_channel_state_t state) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ pci_ers_result_t ret; ++ ++ dev_info(&pdev->dev, "PCI error detected, state(=%u)!!\n", state); ++ ++ if (state == pci_channel_io_perm_failure) ++ return PCI_ERS_RESULT_DISCONNECT; ++ ++ if (!ae_dev || !ae_dev->ops) { ++ dev_err(&pdev->dev, ++ "Can't recover - 
error happened before device initialized\n"); ++ return PCI_ERS_RESULT_NONE; ++ } ++ ++ if (ae_dev->ops->handle_hw_ras_error) ++ ret = ae_dev->ops->handle_hw_ras_error(ae_dev); ++ else ++ return PCI_ERS_RESULT_NONE; ++ ++ return ret; ++} ++ ++static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ const struct hnae3_ae_ops *ops; ++ enum hnae3_reset_type reset_type; ++ struct device *dev = &pdev->dev; ++ ++ if (!ae_dev || !ae_dev->ops) ++ return PCI_ERS_RESULT_NONE; ++ ++ ops = ae_dev->ops; ++ /* request the reset */ ++ if (ops->reset_event && ops->get_reset_level && ++ ops->set_default_reset_request) { ++ if (ae_dev->hw_err_reset_req) { ++ reset_type = ops->get_reset_level(ae_dev, ++ &ae_dev->hw_err_reset_req); ++ ops->set_default_reset_request(ae_dev, reset_type); ++ dev_info(dev, "requesting reset due to PCI error\n"); ++ ops->reset_event(pdev, NULL); ++ } ++ ++ return PCI_ERS_RESULT_RECOVERED; ++ } ++ ++ return PCI_ERS_RESULT_DISCONNECT; ++} ++ ++static void hns3_reset_prepare(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR prepare\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FLR_RESET); ++} ++ ++static void hns3_reset_done(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR done\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++} ++ ++static const struct pci_error_handlers hns3_err_handler = { ++ .error_detected = hns3_error_detected, ++ .slot_reset = hns3_slot_reset, ++ .reset_prepare = hns3_reset_prepare, ++ .reset_done = hns3_reset_done, ++}; ++ ++static SIMPLE_DEV_PM_OPS(hns3_pm_ops, hns3_suspend, hns3_resume); ++ ++static struct pci_driver hns3_driver = { ++ .name = hns3_driver_name, ++ .id_table = hns3_pci_tbl, ++ .probe = hns3_probe, ++ .remove = hns3_remove, ++ .shutdown = hns3_shutdown, ++ .driver.pm = &hns3_pm_ops, ++ .sriov_configure = hns3_pci_sriov_configure, ++ .err_handler = &hns3_err_handler, ++}; ++ ++/* set default feature to hns3 */ ++static void hns3_set_default_feature(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct pci_dev *pdev = h->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ ++ netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; ++ ++ netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | ++ NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | ++ NETIF_F_GRO | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_GRE | ++ NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_UDP_TUNNEL | ++ NETIF_F_SCTP_CRC | NETIF_F_FRAGLIST; ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V2) { ++ netdev->features |= NETIF_F_GRO_HW; ++ ++ if (!(h->flags & HNAE3_SUPPORT_VF)) ++ netdev->features |= NETIF_F_NTUPLE; ++ } ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_GSO_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_L4; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_CSUM; ++ else ++ netdev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_FD_FORWARD_TC_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_TC; ++ ++ netdev->hw_features |= 
netdev->features; ++ if (!test_bit(HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, ae_dev->caps)) ++ netdev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; ++ ++ netdev->vlan_features |= netdev->features & ++ ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_GRO_HW | NETIF_F_NTUPLE | ++ NETIF_F_HW_TC); ++ ++ netdev->hw_enc_features |= netdev->vlan_features | NETIF_F_TSO_MANGLEID; ++} ++ ++static int hns3_alloc_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ unsigned int order = hns3_page_order(ring); ++ struct page *p; ++ ++ if (ring->page_pool) { ++ p = page_pool_dev_alloc_frag(ring->page_pool, ++ &cb->page_offset, ++ hns3_buf_size(ring)); ++ if (unlikely(!p)) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->buf = page_address(p); ++ cb->dma = page_pool_get_dma_addr(p); ++ cb->type = DESC_TYPE_PP_FRAG; ++ cb->reuse_flag = 0; ++ return 0; ++ } ++ ++ p = dev_alloc_pages(order); ++ if (!p) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->page_offset = 0; ++ cb->reuse_flag = 0; ++ cb->buf = page_address(p); ++ cb->length = hns3_page_size(ring); ++ cb->type = DESC_TYPE_PAGE; ++ page_ref_add(p, USHRT_MAX - 1); ++ cb->pagecnt_bias = USHRT_MAX; ++ ++ return 0; ++} ++ ++static void hns3_free_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb, int budget) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) ++ napi_consume_skb(cb->priv, budget); ++ else if (!HNAE3_IS_TX_RING(ring)) { ++ if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) ++ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); ++ else if (cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, cb->priv, ++ false); ++ } ++ memset(cb, 0, sizeof(*cb)); ++} ++ ++static int hns3_map_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) ++{ ++ cb->dma = dma_map_page(ring_to_dev(ring), cb->priv, 0, ++ cb->length, ring_to_dma_dir(ring)); ++ ++ if (unlikely(dma_mapping_error(ring_to_dev(ring), cb->dma))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static void hns3_unmap_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if ((cb->type & DESC_TYPE_PAGE) && cb->length) ++ dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) ++ hns3_tx_spare_reclaim_cb(ring, cb); ++} ++ ++static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc[i].addr = 0; ++ ring->desc_cb[i].refill = 0; ++} ++ ++static void hns3_free_buffer_detach(struct hns3_enet_ring *ring, int i, ++ int budget) ++{ ++ struct hns3_desc_cb *cb = &ring->desc_cb[i]; ++ ++ if (!ring->desc_cb[i].dma) ++ return; ++ ++ hns3_buffer_detach(ring, i); ++ hns3_free_buffer(ring, cb, budget); ++} ++ ++static void hns3_free_buffers(struct hns3_enet_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->desc_num; i++) ++ hns3_free_buffer_detach(ring, i, 0); ++} ++ ++/* free desc along with its attached buffer */ ++static void hns3_free_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ hns3_free_buffers(ring); ++ ++ if (ring->desc) { ++ dma_free_coherent(ring_to_dev(ring), size, ++ ring->desc, ring->desc_dma_addr); ++ ring->desc = NULL; ++ } ++} ++ ++static int 
hns3_alloc_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ ring->desc = dma_alloc_coherent(ring_to_dev(ring), size, ++ &ring->desc_dma_addr, GFP_KERNEL); ++ if (!ring->desc) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ int ret; ++ ++ ret = hns3_alloc_buffer(ring, cb); ++ if (ret || ring->page_pool) ++ goto out; ++ ++ ret = hns3_map_buffer(ring, cb); ++ if (ret) ++ goto out_with_buf; ++ ++ return 0; ++ ++out_with_buf: ++ hns3_free_buffer(ring, cb, 0); ++out: ++ return ret; ++} ++ ++static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ int ret = hns3_alloc_and_map_buffer(ring, &ring->desc_cb[i]); ++ ++ if (ret) ++ return ret; ++ ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc_cb[i].refill = 1; ++ ++ return 0; ++} ++ ++/* Allocate memory for raw pkg, and map with dma */ ++static int hns3_alloc_ring_buffers(struct hns3_enet_ring *ring) ++{ ++ int i, j, ret; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ ret = hns3_alloc_and_attach_buffer(ring, i); ++ if (ret) ++ goto out_buffer_fail; ++ } ++ ++ return 0; ++ ++out_buffer_fail: ++ for (j = i - 1; j >= 0; j--) ++ hns3_free_buffer_detach(ring, j, 0); ++ return ret; ++} ++ ++/* detach a in-used buffer and replace with a reserved one */ ++static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, ++ struct hns3_desc_cb *res_cb) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc_cb[i] = *res_cb; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++} ++ ++static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ ring->desc_cb[i].reuse_flag = 0; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++ ++ dma_sync_single_for_device(ring_to_dev(ring), ++ ring->desc_cb[i].dma + ring->desc_cb[i].page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++} ++ ++static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, ++ int *bytes, int *pkts, int budget) ++{ ++ /* pair with ring->last_to_use update in hns3_tx_doorbell(), ++ * smp_store_release() is not used in hns3_tx_doorbell() because ++ * the doorbell operation already have the needed barrier operation. ++ */ ++ int ltu = smp_load_acquire(&ring->last_to_use); ++ int ntc = ring->next_to_clean; ++ struct hns3_desc_cb *desc_cb; ++ bool reclaimed = false; ++ struct hns3_desc *desc; ++ ++ while (ltu != ntc) { ++ desc = &ring->desc[ntc]; ++ ++ if (le16_to_cpu(desc->tx.bdtp_fe_sc_vld_ra_ri) & ++ BIT(HNS3_TXD_VLD_B)) ++ break; ++ ++ desc_cb = &ring->desc_cb[ntc]; ++ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | ++ DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) { ++ (*pkts)++; ++ (*bytes) += desc_cb->send_bytes; ++ } ++ ++ /* desc_cb will be cleaned, after hnae3_free_buffer_detach */ ++ hns3_free_buffer_detach(ring, ntc, budget); ++ ++ if (++ntc == ring->desc_num) ++ ntc = 0; ++ ++ /* Issue prefetch for next Tx descriptor */ ++ prefetch(&ring->desc_cb[ntc]); ++ reclaimed = true; ++ } ++ ++ if (unlikely(!reclaimed)) ++ return false; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * ring_space called by hns3_nic_net_xmit. 
++ */ ++ smp_store_release(&ring->next_to_clean, ntc); ++ ++ hns3_tx_spare_update(ring); ++ ++ return true; ++} ++ ++void hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct netdev_queue *dev_queue; ++ int bytes, pkts; ++ ++ bytes = 0; ++ pkts = 0; ++ ++ if (unlikely(!hns3_nic_reclaim_desc(ring, &bytes, &pkts, budget))) ++ return; ++ ++ ring->tqp_vector->tx_group.total_bytes += bytes; ++ ring->tqp_vector->tx_group.total_packets += pkts; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_bytes += bytes; ++ ring->stats.tx_pkts += pkts; ++ u64_stats_update_end(&ring->syncp); ++ ++ dev_queue = netdev_get_tx_queue(netdev, ring->tqp->tqp_index); ++ netdev_tx_completed_queue(dev_queue, pkts, bytes); ++ ++ if (unlikely(netif_carrier_ok(netdev) && ++ ring_space(ring) > HNS3_MAX_TSO_BD_NUM)) { ++ /* Make sure that anybody stopping the queue after this ++ * sees the new next_to_clean. ++ */ ++ smp_mb(); ++ if (netif_tx_queue_stopped(dev_queue) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_tx_wake_queue(dev_queue); ++ ring->stats.restart_queue++; ++ } ++ } ++} ++ ++static int hns3_desc_unused(struct hns3_enet_ring *ring) ++{ ++ int ntc = ring->next_to_clean; ++ int ntu = ring->next_to_use; ++ ++ if (unlikely(ntc == ntu && !ring->desc_cb[ntc].refill)) ++ return ring->desc_num; ++ ++ return ((ntc >= ntu) ? 0 : ring->desc_num) + ntc - ntu; ++} ++ ++/* Return true if there is any allocation failure */ ++static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, ++ int cleand_count) ++{ ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc_cb res_cbs; ++ int i, ret; ++ ++ for (i = 0; i < cleand_count; i++) { ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ if (desc_cb->reuse_flag) { ++ hns3_ring_stats_update(ring, reuse_pg_cnt); ++ ++ hns3_reuse_buffer(ring, ring->next_to_use); ++ } else { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx buffer failed: %d\n", ++ ret); ++ ++ writel(i, ring->tqp->io_base + ++ HNS3_RING_RX_RING_HEAD_REG); ++ return true; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ ++ hns3_ring_stats_update(ring, non_reuse_pg); ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ writel(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); ++ return false; ++} ++ ++static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) ++{ ++ return page_count(cb->priv) == cb->pagecnt_bias; ++} ++ ++static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, ++ int pull_len, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 frag_size = size - pull_len; ++ void *frag = napi_alloc_frag(frag_size); ++ ++ if (unlikely(!frag)) { ++ hns3_ring_stats_update(ring, frag_alloc_err); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "failed to allocate rx frag\n"); ++ return -ENOMEM; ++ } ++ ++ desc_cb->reuse_flag = 1; ++ memcpy(frag, desc_cb->buf + frag_offset, frag_size); ++ skb_add_rx_frag(skb, i, virt_to_page(frag), ++ offset_in_page(frag), frag_size, frag_size); ++ ++ hns3_ring_stats_update(ring, frag_alloc); ++ return 0; ++} ++ ++static void hns3_nic_reuse_page(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, int pull_len, 
++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 truesize = hns3_buf_size(ring); ++ u32 frag_size = size - pull_len; ++ int ret = 0; ++ bool reused; ++ ++ if (ring->page_pool) { ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ return; ++ } ++ ++ /* Avoid re-using remote or pfmem page */ ++ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) ++ goto out; ++ ++ reused = hns3_can_reuse_page(desc_cb); ++ ++ /* Rx page can be reused when: ++ * 1. Rx page is only owned by the driver when page_offset ++ * is zero, which means 0 @ truesize will be used by ++ * stack after skb_add_rx_frag() is called, and the rest ++ * of rx page can be reused by driver. ++ * Or ++ * 2. Rx page is only owned by the driver when page_offset ++ * is non-zero, which means page_offset @ truesize will ++ * be used by stack after skb_add_rx_frag() is called, ++ * and 0 @ truesize can be reused by driver. ++ */ ++ if ((!desc_cb->page_offset && reused) || ++ ((desc_cb->page_offset + truesize + truesize) <= ++ hns3_page_size(ring) && desc_cb->page_offset)) { ++ desc_cb->page_offset += truesize; ++ desc_cb->reuse_flag = 1; ++ } else if (desc_cb->page_offset && reused) { ++ desc_cb->page_offset = 0; ++ desc_cb->reuse_flag = 1; ++ } else if (frag_size <= ring->rx_copybreak) { ++ ret = hns3_handle_rx_copybreak(skb, i, ring, pull_len, desc_cb); ++ if (!ret) ++ return; ++ } ++ ++out: ++ desc_cb->pagecnt_bias--; ++ ++ if (unlikely(!desc_cb->pagecnt_bias)) { ++ page_ref_add(desc_cb->priv, USHRT_MAX); ++ desc_cb->pagecnt_bias = USHRT_MAX; ++ } ++ ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ ++ if (unlikely(!desc_cb->reuse_flag)) ++ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); ++} ++ ++static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) ++{ ++ __be16 type = skb->protocol; ++ struct tcphdr *th; ++ int depth = 0; ++ ++ while (eth_type_vlan(type)) { ++ struct vlan_hdr *vh; ++ ++ if ((depth + VLAN_HLEN) > skb_headlen(skb)) ++ return -EFAULT; ++ ++ vh = (struct vlan_hdr *)(skb->data + depth); ++ type = vh->h_vlan_encapsulated_proto; ++ depth += VLAN_HLEN; ++ } ++ ++ skb_set_network_header(skb, depth); ++ ++ if (type == htons(ETH_P_IP)) { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ depth += sizeof(struct iphdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v4_check(skb->len - depth, iph->saddr, ++ iph->daddr, 0); ++ } else if (type == htons(ETH_P_IPV6)) { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ depth += sizeof(struct ipv6hdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v6_check(skb->len - depth, &iph->saddr, ++ &iph->daddr, 0); ++ } else { ++ hns3_rl_err(skb->dev, ++ "Error: FW GRO supports only IPv4/IPv6, not 0x%04x, depth: %d\n", ++ be16_to_cpu(type), depth); ++ return -EFAULT; ++ } ++ ++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; ++ if (th->cwr) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; ++ ++ if (l234info & BIT(HNS3_RXD_GRO_FIXID_B)) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; ++ ++ skb->csum_start = (unsigned char *)th - skb->head; ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ trace_hns3_gro(skb); ++ ++ return 0; ++} ++ ++static bool hns3_checksum_complete(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 
ptype, u16 csum) ++{ ++ if (ptype == HNS3_INVALID_PTYPE || ++ hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) ++ return false; ++ ++ hns3_ring_stats_update(ring, csum_complete); ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum = csum_unfold((__force __sum16)csum); ++ ++ return true; ++} ++ ++static void hns3_rx_handle_csum(struct sk_buff *skb, u32 l234info, ++ u32 ol_info, u32 ptype) ++{ ++ int l3_type, l4_type; ++ int ol4_type; ++ ++ if (ptype != HNS3_INVALID_PTYPE) { ++ skb->csum_level = hns3_rx_ptype_tbl[ptype].csum_level; ++ skb->ip_summed = hns3_rx_ptype_tbl[ptype].ip_summed; ++ ++ return; ++ } ++ ++ ol4_type = hnae3_get_field(ol_info, HNS3_RXD_OL4ID_M, ++ HNS3_RXD_OL4ID_S); ++ switch (ol4_type) { ++ case HNS3_OL4_TYPE_MAC_IN_UDP: ++ case HNS3_OL4_TYPE_NVGRE: ++ skb->csum_level = 1; ++ fallthrough; ++ case HNS3_OL4_TYPE_NO_TUN: ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ /* Can checksum ipv4 or ipv6 + UDP/TCP/SCTP packets */ ++ if ((l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) && ++ (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP)) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ break; ++ default: ++ break; ++ } ++} ++ ++static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, ++ u32 l234info, u32 bd_base_info, u32 ol_info, ++ u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 ptype = HNS3_INVALID_PTYPE; ++ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ skb_checksum_none_assert(skb); ++ ++ if (!(netdev->features & NETIF_F_RXCSUM)) ++ return; ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) ++ ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ if (hns3_checksum_complete(ring, skb, ptype, csum)) ++ return; ++ ++ /* check if hardware has done checksum */ ++ if (!(bd_base_info & BIT(HNS3_RXD_L3L4P_B))) ++ return; ++ ++ if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | ++ BIT(HNS3_RXD_OL3E_B) | ++ BIT(HNS3_RXD_OL4E_B)))) { ++ hns3_ring_stats_update(ring, l3l4_csum_err); ++ ++ return; ++ } ++ ++ hns3_rx_handle_csum(skb, l234info, ol_info, ptype); ++} ++ ++static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ if (skb_has_frag_list(skb)) ++ napi_gro_flush(&ring->tqp_vector->napi, false); ++ ++ napi_gro_receive(&ring->tqp_vector->napi, skb); ++} ++ ++static bool hns3_parse_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, u32 l234info, ++ u16 *vlan_tag) ++{ ++ struct hnae3_handle *handle = ring->tqp->handle; ++ struct pci_dev *pdev = ring->tqp->handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (unlikely(ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2)) { ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ if (!(*vlan_tag & VLAN_VID_MASK)) ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return (*vlan_tag != 0); ++ } ++ ++#define HNS3_STRP_OUTER_VLAN 0x1 ++#define HNS3_STRP_INNER_VLAN 0x2 ++#define HNS3_STRP_BOTH 0x3 ++ ++ /* Hardware always insert VLAN tag into RX descriptor when ++ * remove the tag from packet, driver needs to determine ++ * reporting which tag to stack. 
++ */ ++ switch (hnae3_get_field(l234info, HNS3_RXD_STRP_TAGP_M, ++ HNS3_RXD_STRP_TAGP_S)) { ++ case HNS3_STRP_OUTER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ return true; ++ case HNS3_STRP_INNER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ return true; ++ case HNS3_STRP_BOTH: ++ if (handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ else ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring) ++{ ++ ring->desc[ring->next_to_clean].rx.bd_base_info &= ++ cpu_to_le32(~BIT(HNS3_RXD_VLD_B)); ++ ring->desc_cb[ring->next_to_clean].refill = 0; ++ ring->next_to_clean += 1; ++ ++ if (unlikely(ring->next_to_clean == ring->desc_num)) ++ ring->next_to_clean = 0; ++} ++ ++static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, ++ unsigned char *va) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct sk_buff *skb; ++ ++ ring->skb = napi_alloc_skb(&ring->tqp_vector->napi, HNS3_RX_HEAD_SIZE); ++ skb = ring->skb; ++ if (unlikely(!skb)) { ++ hns3_rl_err(netdev, "alloc rx skb fail\n"); ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return -ENOMEM; ++ } ++ ++ trace_hns3_rx_desc(ring); ++ prefetchw(skb->data); ++ ++ ring->pending_buf = 1; ++ ring->frag_num = 0; ++ ring->tail_skb = NULL; ++ if (length <= HNS3_RX_HEAD_SIZE) { ++ memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long))); ++ ++ /* We can reuse buffer as-is, just make sure it is reusable */ ++ if (dev_page_is_reusable(desc_cb->priv)) ++ desc_cb->reuse_flag = 1; ++ else if (desc_cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, desc_cb->priv, ++ false); ++ else /* This page cannot be reused so discard it */ ++ __page_frag_cache_drain(desc_cb->priv, ++ desc_cb->pagecnt_bias); ++ ++ hns3_rx_ring_move_fw(ring); ++ return 0; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(skb); ++ ++ hns3_ring_stats_update(ring, seg_pkt_cnt); ++ ++ ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE); ++ __skb_put(skb, ring->pull_len); ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len, ++ desc_cb); ++ hns3_rx_ring_move_fw(ring); ++ ++ return 0; ++} ++ ++static int hns3_add_frag(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct sk_buff *head_skb = skb; ++ struct sk_buff *new_skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ u32 bd_base_info; ++ ++ do { ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* make sure HW write desc complete */ ++ dma_rmb(); ++ if (!(bd_base_info & BIT(HNS3_RXD_VLD_B))) ++ return -ENXIO; ++ ++ if (unlikely(ring->frag_num >= MAX_SKB_FRAGS)) { ++ new_skb = napi_alloc_skb(&ring->tqp_vector->napi, 0); ++ if (unlikely(!new_skb)) { ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx fraglist skb fail\n"); ++ return -ENXIO; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(new_skb); ++ ++ ring->frag_num = 0; ++ ++ if (ring->tail_skb) { ++ ring->tail_skb->next = new_skb; ++ ring->tail_skb = new_skb; ++ } else { ++ skb_shinfo(skb)->frag_list = 
new_skb; ++ ring->tail_skb = new_skb; ++ } ++ } ++ ++ if (ring->tail_skb) { ++ head_skb->truesize += hns3_buf_size(ring); ++ head_skb->data_len += le16_to_cpu(desc->rx.size); ++ head_skb->len += le16_to_cpu(desc->rx.size); ++ skb = ring->tail_skb; ++ } ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb); ++ trace_hns3_rx_desc(ring); ++ hns3_rx_ring_move_fw(ring); ++ ring->pending_buf++; ++ } while (!(bd_base_info & BIT(HNS3_RXD_FE_B))); ++ ++ return 0; ++} ++ ++static int hns3_set_gro_and_checksum(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 l234info, ++ u32 bd_base_info, u32 ol_info, u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 l3_type; ++ ++ skb_shinfo(skb)->gso_size = hnae3_get_field(bd_base_info, ++ HNS3_RXD_GRO_SIZE_M, ++ HNS3_RXD_GRO_SIZE_S); ++ /* if there is no HW GRO, do not set gro params */ ++ if (!skb_shinfo(skb)->gso_size) { ++ hns3_rx_checksum(ring, skb, l234info, bd_base_info, ol_info, ++ csum); ++ return 0; ++ } ++ ++ NAPI_GRO_CB(skb)->count = hnae3_get_field(l234info, ++ HNS3_RXD_GRO_COUNT_M, ++ HNS3_RXD_GRO_COUNT_S); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ l3_type = hns3_rx_ptype_tbl[ptype].l3_type; ++ } else { ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ } ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ else if (l3_type == HNS3_L3_TYPE_IPV6) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; ++ else ++ return -EFAULT; ++ ++ return hns3_gro_complete(skb, l234info); ++} ++ ++static void hns3_set_rx_skb_rss_type(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 rss_hash, ++ u32 l234info, u32 ol_info) ++{ ++ enum pkt_hash_types rss_type = PKT_HASH_TYPE_NONE; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ rss_type = hns3_rx_ptype_tbl[ptype].hash_type; ++ } else { ++ int l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ int l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) { ++ if (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP) ++ rss_type = PKT_HASH_TYPE_L4; ++ else if (l4_type == HNS3_L4_TYPE_IGMP || ++ l4_type == HNS3_L4_TYPE_ICMP) ++ rss_type = PKT_HASH_TYPE_L3; ++ } ++ } ++ ++ skb_set_hash(skb, rss_hash, rss_type); ++} ++ ++static void hns3_handle_rx_ts_info(struct net_device *netdev, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 bd_base_info) ++{ ++ if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u32 nsec = le32_to_cpu(desc->ts_nsec); ++ u32 sec = le32_to_cpu(desc->ts_sec); ++ ++ if (h->ae_algo->ops->get_rx_hwts) ++ h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); ++ } ++} ++ ++static void hns3_handle_rx_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 l234info) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ ++ /* Based on 
hw strategy, the tag offloaded will be stored at ++ * ot_vlan_tag in two layer tag case, and stored at vlan_tag ++ * in one layer tag case. ++ */ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ u16 vlan_tag; ++ ++ if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ vlan_tag); ++ } ++} ++ ++static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ enum hns3_pkt_l2t_type l2_frame_type; ++ u32 bd_base_info, l234info, ol_info; ++ struct hns3_desc *desc; ++ unsigned int len; ++ int pre_ntc, ret; ++ u16 csum; ++ ++ /* bdinfo handled below is only valid on the last BD of the ++ * current packet, and ring->next_to_clean indicates the first ++ * descriptor of next packet, so need - 1 below. ++ */ ++ pre_ntc = ring->next_to_clean ? (ring->next_to_clean - 1) : ++ (ring->desc_num - 1); ++ desc = &ring->desc[pre_ntc]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ l234info = le32_to_cpu(desc->rx.l234_info); ++ ol_info = le32_to_cpu(desc->rx.ol_info); ++ csum = le16_to_cpu(desc->csum); ++ ++ hns3_handle_rx_ts_info(netdev, desc, skb, bd_base_info); ++ ++ hns3_handle_rx_vlan_tag(ring, desc, skb, l234info); ++ ++ if (unlikely(!desc->rx.pkt_len || (l234info & (BIT(HNS3_RXD_TRUNCAT_B) | ++ BIT(HNS3_RXD_L2E_B))))) { ++ u64_stats_update_begin(&ring->syncp); ++ if (l234info & BIT(HNS3_RXD_L2E_B)) ++ ring->stats.l2_err++; ++ else ++ ring->stats.err_pkt_len++; ++ u64_stats_update_end(&ring->syncp); ++ ++ return -EFAULT; ++ } ++ ++ len = skb->len; ++ ++ /* Do update ip stack process */ ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ /* This is needed in order to enable forwarding support */ ++ ret = hns3_set_gro_and_checksum(ring, skb, l234info, ++ bd_base_info, ol_info, csum); ++ if (unlikely(ret)) { ++ hns3_ring_stats_update(ring, rx_err_cnt); ++ return ret; ++ } ++ ++ l2_frame_type = hnae3_get_field(l234info, HNS3_RXD_DMAC_M, ++ HNS3_RXD_DMAC_S); ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.rx_pkts++; ++ ring->stats.rx_bytes += len; ++ ++ if (l2_frame_type == HNS3_L2_TYPE_MULTICAST) ++ ring->stats.rx_multicast++; ++ ++ u64_stats_update_end(&ring->syncp); ++ ++ ring->tqp_vector->rx_group.total_bytes += len; ++ ++ hns3_set_rx_skb_rss_type(ring, skb, le32_to_cpu(desc->rx.rss_hash), ++ l234info, ol_info); ++ return 0; ++} ++ ++static int hns3_handle_rx_bd(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ unsigned int length; ++ u32 bd_base_info; ++ int ret; ++ ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ ++ prefetch(desc); ++ ++ if (!skb) { ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* Check valid BD */ ++ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B)))) ++ return -ENXIO; ++ ++ dma_rmb(); ++ length = le16_to_cpu(desc->rx.size); ++ ++ ring->va = desc_cb->buf + desc_cb->page_offset; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ /* Prefetch first cache line of first page. ++ * Idea is to cache few bytes of the header of the packet. ++ * Our L1 Cache line size is 64B so need to prefetch twice to make ++ * it 128B. But in actual we can have greater size of caches with ++ * 128B Level 1 cache lines. In such a case, single fetch would ++ * suffice to cache in the relevant part of the header. 
++ */ ++ net_prefetch(ring->va); ++ ++ ret = hns3_alloc_skb(ring, length, ring->va); ++ skb = ring->skb; ++ ++ if (ret < 0) /* alloc buffer fail */ ++ return ret; ++ if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) { /* need add frag */ ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ } else { ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ ++ /* As the head data may be changed when GRO enable, copy ++ * the head data in after other data rx completed ++ */ ++ if (skb->len > HNS3_RX_HEAD_SIZE) ++ memcpy(skb->data, ring->va, ++ ALIGN(ring->pull_len, sizeof(long))); ++ ++ ret = hns3_handle_bdinfo(ring, skb); ++ if (unlikely(ret)) { ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ ++ skb_record_rx_queue(skb, ring->tqp->tqp_index); ++ return 0; ++} ++ ++int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, ++ void (*rx_fn)(struct hns3_enet_ring *, struct sk_buff *)) ++{ ++#define RCB_NOF_ALLOC_RX_BUFF_ONCE 16 ++ int unused_count = hns3_desc_unused(ring); ++ bool failure = false; ++ int recv_pkts = 0; ++ int err; ++ ++ unused_count -= ring->pending_buf; ++ ++ while (recv_pkts < budget) { ++ /* Reuse or realloc buffers */ ++ if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) { ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ unused_count = 0; ++ } ++ ++ /* Poll one pkt */ ++ err = hns3_handle_rx_bd(ring); ++ /* Do not get FE for the packet or failed to alloc skb */ ++ if (unlikely(!ring->skb || err == -ENXIO)) { ++ goto out; ++ } else if (likely(!err)) { ++ rx_fn(ring, ring->skb); ++ recv_pkts++; ++ } ++ ++ unused_count += ring->pending_buf; ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++out: ++ /* sync head pointer before exiting, since hardware will calculate ++ * FBD number with head pointer ++ */ ++ if (unused_count > 0) ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ ++ return failure ? budget : recv_pkts; ++} ++ ++static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group; ++ struct dim_sample sample = {}; ++ ++ if (!rx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets, ++ rx_group->total_bytes, &sample); ++ net_dim(&rx_group->dim, sample); ++} ++ ++static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group; ++ struct dim_sample sample = {}; ++ ++ if (!tx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, ++ tx_group->total_bytes, &sample); ++ net_dim(&tx_group->dim, sample); ++} ++ ++static int hns3_nic_common_poll(struct napi_struct *napi, int budget) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(napi->dev); ++ struct hns3_enet_ring *ring; ++ int rx_pkt_total = 0; ++ ++ struct hns3_enet_tqp_vector *tqp_vector = ++ container_of(napi, struct hns3_enet_tqp_vector, napi); ++ bool clean_complete = true; ++ int rx_budget = budget; ++ ++ if (unlikely(test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ napi_complete(napi); ++ return 0; ++ } ++ ++ /* Since the actual Tx work is minimal, we can give the Tx a larger ++ * budget and be more aggressive about cleaning up the Tx descriptors. 
++ */ ++ hns3_for_each_ring(ring, tqp_vector->tx_group) ++ hns3_clean_tx_ring(ring, budget); ++ ++ /* make sure rx ring budget not smaller than 1 */ ++ if (tqp_vector->num_tqps > 1) ++ rx_budget = max(budget / tqp_vector->num_tqps, 1); ++ ++ hns3_for_each_ring(ring, tqp_vector->rx_group) { ++ int rx_cleaned = hns3_clean_rx_ring(ring, rx_budget, ++ hns3_rx_skb); ++ if (rx_cleaned >= rx_budget) ++ clean_complete = false; ++ ++ rx_pkt_total += rx_cleaned; ++ } ++ ++ tqp_vector->rx_group.total_packets += rx_pkt_total; ++ ++ if (!clean_complete) ++ return budget; ++ ++ if (napi_complete(napi) && ++ likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ hns3_update_rx_int_coalesce(tqp_vector); ++ hns3_update_tx_int_coalesce(tqp_vector); ++ ++ hns3_mask_vector_irq(tqp_vector, 1); ++ } ++ ++ return rx_pkt_total; ++} ++ ++static int hns3_create_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node **head, ++ bool is_tx) ++{ ++ u32 bit_value = is_tx ? HNAE3_RING_TYPE_TX : HNAE3_RING_TYPE_RX; ++ u32 field_value = is_tx ? HNAE3_RING_GL_TX : HNAE3_RING_GL_RX; ++ struct hnae3_ring_chain_node *cur_chain = *head; ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain; ++ struct hns3_enet_ring *ring; ++ ++ ring = is_tx ? tqp_vector->tx_group.ring : tqp_vector->rx_group.ring; ++ ++ if (cur_chain) { ++ while (cur_chain->next) ++ cur_chain = cur_chain->next; ++ } ++ ++ while (ring) { ++ chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); ++ if (!chain) ++ return -ENOMEM; ++ if (cur_chain) ++ cur_chain->next = chain; ++ else ++ *head = chain; ++ chain->tqp_index = ring->tqp->tqp_index; ++ hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, ++ bit_value); ++ hnae3_set_field(chain->int_gl_idx, ++ HNAE3_RING_GL_IDX_M, ++ HNAE3_RING_GL_IDX_S, field_value); ++ ++ cur_chain = chain; ++ ++ ring = ring->next; ++ } ++ ++ return 0; ++} ++ ++static struct hnae3_ring_chain_node * ++hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *cur_chain = NULL; ++ struct hnae3_ring_chain_node *chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, true)) ++ goto err_free_chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, false)) ++ goto err_free_chain; ++ ++ return cur_chain; ++ ++err_free_chain: ++ while (cur_chain) { ++ chain = cur_chain->next; ++ devm_kfree(&pdev->dev, cur_chain); ++ cur_chain = chain; ++ } ++ ++ return NULL; ++} ++ ++static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node *head) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain_tmp, *chain; ++ ++ chain = head; ++ ++ while (chain) { ++ chain_tmp = chain->next; ++ devm_kfree(&pdev->dev, chain); ++ chain = chain_tmp; ++ } ++} ++ ++static void hns3_add_ring_to_group(struct hns3_enet_ring_group *group, ++ struct hns3_enet_ring *ring) ++{ ++ ring->next = group->ring; ++ group->ring = ring; ++ ++ group->count++; ++} ++ ++static void hns3_nic_set_cpumask(struct hns3_nic_priv *priv) ++{ ++ struct pci_dev *pdev = priv->ae_handle->pdev; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int num_vectors = priv->vector_num; ++ int numa_node; ++ int vector_i; ++ ++ numa_node = dev_to_node(&pdev->dev); ++ ++ for (vector_i = 0; vector_i < num_vectors; vector_i++) { ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ cpumask_set_cpu(cpumask_local_spread(vector_i, numa_node), ++ 
&tqp_vector->affinity_mask); ++ } ++} ++ ++static void hns3_rx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_rx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec); ++ tqp_vector->rx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->rx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_tx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_tx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec); ++ tqp_vector->tx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->tx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->tx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work); ++ INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work); ++} ++ ++static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int ret; ++ int i; ++ ++ hns3_nic_set_cpumask(priv); ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ hns3_vector_coalesce_init_hw(tqp_vector, priv); ++ tqp_vector->num_tqps = 0; ++ hns3_nic_init_dim(tqp_vector); ++ } ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ u16 vector_i = i % priv->vector_num; ++ u16 tqp_num = h->kinfo.num_tqps; ++ ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ ++ hns3_add_ring_to_group(&tqp_vector->tx_group, ++ &priv->ring[i]); ++ ++ hns3_add_ring_to_group(&tqp_vector->rx_group, ++ &priv->ring[i + tqp_num]); ++ ++ priv->ring[i].tqp_vector = tqp_vector; ++ priv->ring[i + tqp_num].tqp_vector = tqp_vector; ++ tqp_vector->num_tqps++; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ tqp_vector->rx_group.total_bytes = 0; ++ tqp_vector->rx_group.total_packets = 0; ++ tqp_vector->tx_group.total_bytes = 0; ++ tqp_vector->tx_group.total_packets = 0; ++ tqp_vector->handle = h; ++ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) { ++ ret = -ENOMEM; ++ goto map_ring_fail; ++ } ++ ++ ret = h->ae_algo->ops->map_ring_to_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ if (ret) ++ goto map_ring_fail; ++ ++ netif_napi_add(priv->netdev, &tqp_vector->napi, ++ hns3_nic_common_poll, NAPI_POLL_WEIGHT); ++ } ++ ++ return 0; ++ ++map_ring_fail: ++ while (i--) ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ ++ return ret; ++} ++ ++static void 
hns3_nic_init_coal_cfg(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *rx_coal = &priv->rx_coal; ++ ++ /* initialize the configuration for interrupt coalescing. ++ * 1. GL (Interrupt Gap Limiter) ++ * 2. RL (Interrupt Rate Limiter) ++ * 3. QL (Interrupt Quantity Limiter) ++ * ++ * Default: enable interrupt coalescing self-adaptive and GL ++ */ ++ tx_coal->adapt_enable = 1; ++ rx_coal->adapt_enable = 1; ++ ++ tx_coal->int_gl = HNS3_INT_GL_50K; ++ rx_coal->int_gl = HNS3_INT_GL_50K; ++ ++ rx_coal->flow_level = HNS3_FLOW_LOW; ++ tx_coal->flow_level = HNS3_FLOW_LOW; ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ } ++} ++ ++static int hns3_nic_alloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ struct hnae3_vector_info *vector; ++ struct pci_dev *pdev = h->pdev; ++ u16 tqp_num = h->kinfo.num_tqps; ++ u16 vector_num; ++ int ret = 0; ++ u16 i; ++ ++ /* RSS size, cpu online and vector_num should be the same */ ++ /* Should consider 2p/4p later */ ++ vector_num = min_t(u16, num_online_cpus(), tqp_num); ++ ++ vector = devm_kcalloc(&pdev->dev, vector_num, sizeof(*vector), ++ GFP_KERNEL); ++ if (!vector) ++ return -ENOMEM; ++ ++ /* save the actual available vector number */ ++ vector_num = h->ae_algo->ops->get_vector(h, vector_num, vector); ++ ++ priv->vector_num = vector_num; ++ priv->tqp_vector = (struct hns3_enet_tqp_vector *) ++ devm_kcalloc(&pdev->dev, vector_num, sizeof(*priv->tqp_vector), ++ GFP_KERNEL); ++ if (!priv->tqp_vector) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ tqp_vector->idx = i; ++ tqp_vector->mask_addr = vector[i].io_addr; ++ tqp_vector->vector_irq = vector[i].vector; ++ hns3_vector_coalesce_init(tqp_vector, priv); ++ } ++ ++out: ++ devm_kfree(&pdev->dev, vector); ++ return ret; ++} ++ ++static void hns3_clear_ring_group(struct hns3_enet_ring_group *group) ++{ ++ group->ring = NULL; ++ group->count = 0; ++} ++ ++static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ if (!tqp_vector->rx_group.ring && !tqp_vector->tx_group.ring) ++ continue; ++ ++ /* Since the mapping can be overwritten, when fail to get the ++ * chain between vector and ring, we should go on to deal with ++ * the remaining options. 
++ */ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) ++ dev_warn(priv->dev, "failed to get ring chain\n"); ++ ++ h->ae_algo->ops->unmap_ring_from_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ hns3_clear_ring_group(&tqp_vector->rx_group); ++ hns3_clear_ring_group(&tqp_vector->tx_group); ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ } ++} ++ ++static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i, ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = h->ae_algo->ops->put_vector(h, tqp_vector->vector_irq); ++ if (ret) ++ return; ++ } ++ ++ devm_kfree(&pdev->dev, priv->tqp_vector); ++} ++ ++static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ++ unsigned int ring_type) ++{ ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hns3_enet_ring *ring; ++ int desc_num; ++ ++ if (ring_type == HNAE3_RING_TYPE_TX) { ++ ring = &priv->ring[q->tqp_index]; ++ desc_num = priv->ae_handle->kinfo.num_tx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->tx_copybreak = priv->tx_copybreak; ++ ring->last_to_use = 0; ++ } else { ++ ring = &priv->ring[q->tqp_index + queue_num]; ++ desc_num = priv->ae_handle->kinfo.num_rx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->rx_copybreak = priv->rx_copybreak; ++ } ++ ++ hnae3_set_bit(ring->flag, HNAE3_RING_TYPE_B, ring_type); ++ ++ ring->tqp = q; ++ ring->desc = NULL; ++ ring->desc_cb = NULL; ++ ring->dev = priv->dev; ++ ring->desc_dma_addr = 0; ++ ring->buf_size = q->buf_size; ++ ring->desc_num = desc_num; ++ ring->next_to_use = 0; ++ ring->next_to_clean = 0; ++} ++ ++static void hns3_queue_to_ring(struct hnae3_queue *tqp, ++ struct hns3_nic_priv *priv) ++{ ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_TX); ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_RX); ++} ++ ++static int hns3_get_ring_config(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i; ++ ++ priv->ring = devm_kzalloc(&pdev->dev, ++ array3_size(h->kinfo.num_tqps, ++ sizeof(*priv->ring), 2), ++ GFP_KERNEL); ++ if (!priv->ring) ++ return -ENOMEM; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_queue_to_ring(h->kinfo.tqp[i], priv); ++ ++ return 0; ++} ++ ++static void hns3_put_ring_config(struct hns3_nic_priv *priv) ++{ ++ if (!priv->ring) ++ return; ++ ++ devm_kfree(priv->dev, priv->ring); ++ priv->ring = NULL; ++} ++ ++static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) ++{ ++ struct page_pool_params pp_params = { ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | ++ PP_FLAG_DMA_SYNC_DEV, ++ .order = hns3_page_order(ring), ++ .pool_size = ring->desc_num * hns3_buf_size(ring) / ++ (PAGE_SIZE << hns3_page_order(ring)), ++ .nid = dev_to_node(ring_to_dev(ring)), ++ .dev = ring_to_dev(ring), ++ .dma_dir = DMA_FROM_DEVICE, ++ .offset = 0, ++ .max_len = PAGE_SIZE << hns3_page_order(ring), ++ }; ++ ++ ring->page_pool = page_pool_create(&pp_params); ++ if (IS_ERR(ring->page_pool)) { ++ dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", ++ PTR_ERR(ring->page_pool)); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) ++{ ++ int ret; ++ ++ if (ring->desc_num <= 0 || ring->buf_size <= 0) ++ return -EINVAL; 
++ ++ ring->desc_cb = devm_kcalloc(ring_to_dev(ring), ring->desc_num, ++ sizeof(ring->desc_cb[0]), GFP_KERNEL); ++ if (!ring->desc_cb) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = hns3_alloc_desc(ring); ++ if (ret) ++ goto out_with_desc_cb; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ if (page_pool_enabled) ++ hns3_alloc_page_pool(ring); ++ ++ ret = hns3_alloc_ring_buffers(ring); ++ if (ret) ++ goto out_with_desc; ++ } else { ++ hns3_init_tx_spare_buffer(ring); ++ } ++ ++ return 0; ++ ++out_with_desc: ++ hns3_free_desc(ring); ++out_with_desc_cb: ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++out: ++ return ret; ++} ++ ++void hns3_fini_ring(struct hns3_enet_ring *ring) ++{ ++ hns3_free_desc(ring); ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++ ring->next_to_clean = 0; ++ ring->next_to_use = 0; ++ ring->last_to_use = 0; ++ ring->pending_buf = 0; ++ if (!HNAE3_IS_TX_RING(ring) && ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ } else if (HNAE3_IS_TX_RING(ring) && ring->tx_spare) { ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ dma_unmap_page(ring_to_dev(ring), tx_spare->dma, tx_spare->len, ++ DMA_TO_DEVICE); ++ free_pages((unsigned long)tx_spare->buf, ++ get_order(tx_spare->len)); ++ devm_kfree(ring_to_dev(ring), tx_spare); ++ ring->tx_spare = NULL; ++ } ++ ++ if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_buf_size2type(u32 buf_size) ++{ ++ int bd_size_type; ++ ++ switch (buf_size) { ++ case 512: ++ bd_size_type = HNS3_BD_SIZE_512_TYPE; ++ break; ++ case 1024: ++ bd_size_type = HNS3_BD_SIZE_1024_TYPE; ++ break; ++ case 2048: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ break; ++ case 4096: ++ bd_size_type = HNS3_BD_SIZE_4096_TYPE; ++ break; ++ default: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ } ++ ++ return bd_size_type; ++} ++ ++static void hns3_init_ring_hw(struct hns3_enet_ring *ring) ++{ ++ dma_addr_t dma = ring->desc_dma_addr; ++ struct hnae3_queue *q = ring->tqp; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_L_REG, (u32)dma); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_LEN_REG, ++ hns3_buf_size2type(ring->buf_size)); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } else { ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_L_REG, ++ (u32)dma); ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_TX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } ++} ++ ++static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ int i; ++ ++ for (i = 0; i < tc_info->num_tc; i++) { ++ int j; ++ ++ for (j = 0; j < tc_info->tqp_count[i]; j++) { ++ struct hnae3_queue *q; ++ ++ q = priv->ring[tc_info->tqp_offset[i] + j].tqp; ++ hns3_write_dev(q, HNS3_RING_TX_RING_TC_REG, i); ++ } ++ } ++} ++ ++int hns3_init_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int ring_num = h->kinfo.num_tqps * 2; ++ int i, j; ++ int ret; ++ ++ for (i = 0; i < ring_num; i++) { ++ ret = hns3_alloc_ring_memory(&priv->ring[i]); ++ if (ret) { ++ dev_err(priv->dev, ++ "Alloc ring memory fail! 
ret=%d\n", ret); ++ goto out_when_alloc_ring_memory; ++ } ++ ++ u64_stats_init(&priv->ring[i].syncp); ++ } ++ ++ return 0; ++ ++out_when_alloc_ring_memory: ++ for (j = i - 1; j >= 0; j--) ++ hns3_fini_ring(&priv->ring[j]); ++ ++ return -ENOMEM; ++} ++ ++static void hns3_uninit_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_fini_ring(&priv->ring[i]); ++ hns3_fini_ring(&priv->ring[i + h->kinfo.num_tqps]); ++ } ++} ++ ++/* Set mac addr if it is configured. or leave it to the AE driver */ ++static int hns3_init_mac_addr(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = priv->ae_handle; ++ u8 mac_addr_temp[ETH_ALEN]; ++ int ret = 0; ++ ++ if (h->ae_algo->ops->get_mac_addr) ++ h->ae_algo->ops->get_mac_addr(h, mac_addr_temp); ++ ++ /* Check if the MAC address is valid, if not get a random one */ ++ if (!is_valid_ether_addr(mac_addr_temp)) { ++ eth_hw_addr_random(netdev); ++ hnae3_format_mac_addr(format_mac_addr, netdev->dev_addr); ++ dev_warn(priv->dev, "using random MAC address %s\n", ++ format_mac_addr); ++ } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ++ eth_hw_addr_set(netdev, mac_addr_temp); ++ ether_addr_copy(netdev->perm_addr, mac_addr_temp); ++ } else { ++ return 0; ++ } ++ ++ if (h->ae_algo->ops->set_mac_addr) ++ ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); ++ ++ return ret; ++} ++ ++static int hns3_init_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = 0; ++ ++ if (h->ae_algo->ops->mac_connect_phy) ++ ret = h->ae_algo->ops->mac_connect_phy(h); ++ ++ return ret; ++} ++ ++static void hns3_uninit_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->mac_disconnect_phy) ++ h->ae_algo->ops->mac_disconnect_phy(h); ++} ++ ++static int hns3_client_start(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_start) ++ return 0; ++ ++ return handle->ae_algo->ops->client_start(handle); ++} ++ ++static void hns3_client_stop(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_stop) ++ return; ++ ++ handle->ae_algo->ops->client_stop(handle); ++} ++ ++static void hns3_info_show(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ hnae3_format_mac_addr(format_mac_addr, priv->netdev->dev_addr); ++ dev_info(priv->dev, "MAC address: %s\n", format_mac_addr); ++ dev_info(priv->dev, "Task queue pairs numbers: %u\n", kinfo->num_tqps); ++ dev_info(priv->dev, "RSS size: %u\n", kinfo->rss_size); ++ dev_info(priv->dev, "Allocated RSS size: %u\n", kinfo->req_rss_size); ++ dev_info(priv->dev, "RX buffer length: %u\n", kinfo->rx_buf_len); ++ dev_info(priv->dev, "Desc num per TX queue: %u\n", kinfo->num_tx_desc); ++ dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc); ++ dev_info(priv->dev, "Total number of enabled TCs: %u\n", ++ kinfo->tc_info.num_tc); ++ dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu); ++} ++ ++static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode mode, bool is_tx) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hnae3_handle *handle = priv->ae_handle; ++ int i; ++ ++ if (is_tx) { ++ 
priv->tx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].tx_group.dim.mode = mode; ++ } else { ++ priv->rx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].rx_group.dim.mode = mode; ++ } ++ ++ if (hnae3_ae_dev_cq_supported(ae_dev)) { ++ u32 new_mode; ++ u64 reg; ++ ++ new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ? ++ HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE; ++ reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG; ++ ++ writel(new_mode, handle->kinfo.io_base + reg); ++ } ++} ++ ++void hns3_cq_period_mode_init(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode tx_mode, ++ enum dim_cq_period_mode rx_mode) ++{ ++ hns3_set_cq_period_mode(priv, tx_mode, true); ++ hns3_set_cq_period_mode(priv, rx_mode, false); ++} ++ ++static void hns3_state_init(struct hnae3_handle *handle) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state); ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state); ++ ++ if (hnae3_ae_dev_rxd_adv_layout_supported(ae_dev)) ++ set_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state); ++} ++ ++static void hns3_state_uninit(struct hnae3_handle *handle) ++{ ++ struct hns3_nic_priv *priv = handle->priv; ++ ++ clear_bit(HNS3_NIC_STATE_INITED, &priv->state); ++} ++ ++static int hns3_client_init(struct hnae3_handle *handle) ++{ ++ struct pci_dev *pdev = handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ u16 alloc_tqps, max_rss_size; ++ struct hns3_nic_priv *priv; ++ struct net_device *netdev; ++ int ret; ++ ++ handle->ae_algo->ops->get_tqps_and_rss_info(handle, &alloc_tqps, ++ &max_rss_size); ++ netdev = alloc_etherdev_mq(sizeof(struct hns3_nic_priv), alloc_tqps); ++ if (!netdev) ++ return -ENOMEM; ++ ++ priv = netdev_priv(netdev); ++ priv->dev = &pdev->dev; ++ priv->netdev = netdev; ++ priv->ae_handle = handle; ++ priv->tx_timeout_count = 0; ++ priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); ++ ++ handle->kinfo.netdev = netdev; ++ handle->priv = (void *)priv; ++ ++ hns3_init_mac_addr(netdev); ++ ++ hns3_set_default_feature(netdev); ++ ++ netdev->watchdog_timeo = HNS3_TX_TIMEOUT; ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ netdev->netdev_ops = &hns3_nic_netdev_ops; ++ SET_NETDEV_DEV(netdev, &pdev->dev); ++ hns3_ethtool_set_ops(netdev); ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_get_ring_cfg; ++ } ++ ++ hns3_nic_init_coal_cfg(priv); ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_alloc_vector_data; ++ } ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_vector_data; ++ } ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_ring; ++ } ++ ++ hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE, ++ 
DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ ++ ret = hns3_init_phy(netdev); ++ if (ret) ++ goto out_init_phy; ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto out_init_irq_fail; ++ } ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); ++ goto out_client_start; ++ } ++ ++ hns3_dcbnl_setup(handle); ++ ++ ret = hns3_dbg_init(handle); ++ if (ret) { ++ dev_err(priv->dev, "failed to init debugfs, ret = %d\n", ++ ret); ++ goto out_client_start; ++ } ++ ++ netdev->max_mtu = HNS3_MAX_MTU(ae_dev->dev_specs.max_frm_size); ++ ++ hns3_state_init(handle); ++ ++ ret = register_netdev(netdev); ++ if (ret) { ++ dev_err(priv->dev, "probe register netdev fail!\n"); ++ goto out_reg_netdev_fail; ++ } ++ ++ if (netif_msg_drv(handle)) ++ hns3_info_show(priv); ++ ++ return ret; ++ ++out_reg_netdev_fail: ++ hns3_state_uninit(handle); ++ hns3_dbg_uninit(handle); ++ hns3_client_stop(handle); ++out_client_start: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++out_init_irq_fail: ++ hns3_uninit_phy(netdev); ++out_init_phy: ++ hns3_uninit_all_ring(priv); ++out_init_ring: ++ hns3_nic_uninit_vector_data(priv); ++out_init_vector_data: ++ hns3_nic_dealloc_vector_data(priv); ++out_alloc_vector_data: ++ priv->ring = NULL; ++out_get_ring_cfg: ++ priv->ae_handle = NULL; ++ free_netdev(netdev); ++ return ret; ++} ++ ++static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (netdev->reg_state != NETREG_UNINITIALIZED) ++ unregister_netdev(netdev); ++ ++ hns3_client_stop(handle); ++ ++ hns3_uninit_phy(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ goto out_netdev_free; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ ++ hns3_nic_uninit_irq(priv); ++ ++ hns3_clear_all_ring(handle, true); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++out_netdev_free: ++ hns3_dbg_uninit(handle); ++ free_netdev(netdev); ++} ++ ++static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ ++ if (!netdev) ++ return; ++ ++ if (linkup) { ++ netif_tx_wake_all_queues(netdev); ++ netif_carrier_on(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link up\n"); ++ } else { ++ netif_carrier_off(netdev); ++ netif_tx_stop_all_queues(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link down\n"); ++ } ++} ++ ++static void hns3_clear_tx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_clean != ring->next_to_use) { ++ ring->desc[ring->next_to_clean].tx.bdtp_fe_sc_vld_ra_ri = 0; ++ hns3_free_buffer_detach(ring, ring->next_to_clean, 0); ++ ring_ptr_move_fw(ring, next_to_clean); ++ } ++ ++ ring->pending_buf = 0; ++} ++ ++static int hns3_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ struct hns3_desc_cb res_cbs; ++ int ret; ++ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will 
be freed by ++ * stack, so we need to replace the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ /* if alloc new buffer fail, exit directly ++ * and reclear in up flow. ++ */ ++ netdev_warn(ring_to_netdev(ring), ++ "reserve buffer map failed, ret = %d\n", ++ ret); ++ return ret; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ } ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ /* Free the pending skb in rx ring */ ++ if (ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++ return 0; ++} ++ ++static void hns3_force_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will be freed by ++ * stack, so only need to unmap the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ hns3_unmap_buffer(ring, ++ &ring->desc_cb[ring->next_to_use]); ++ ring->desc_cb[ring->next_to_use].dma = 0; ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++} ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ struct hns3_enet_ring *ring; ++ ++ ring = &priv->ring[i]; ++ hns3_clear_tx_ring(ring); ++ ++ ring = &priv->ring[i + h->kinfo.num_tqps]; ++ /* Continue to clear other rings even if clearing some ++ * rings failed. ++ */ ++ if (force) ++ hns3_force_clear_rx_ring(ring); ++ else ++ hns3_clear_rx_ring(ring); ++ } ++} ++ ++int hns3_nic_reset_all_ring(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hns3_enet_ring *rx_ring; ++ int i, j; ++ int ret; ++ ++ ret = h->ae_algo->ops->reset_queue(h); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_init_ring_hw(&priv->ring[i]); ++ ++ /* We need to clear tx ring here because self test will ++ * use the ring and will not run down before up ++ */ ++ hns3_clear_tx_ring(&priv->ring[i]); ++ priv->ring[i].next_to_clean = 0; ++ priv->ring[i].next_to_use = 0; ++ priv->ring[i].last_to_use = 0; ++ ++ rx_ring = &priv->ring[i + h->kinfo.num_tqps]; ++ hns3_init_ring_hw(rx_ring); ++ ret = hns3_clear_rx_ring(rx_ring); ++ if (ret) ++ return ret; ++ ++ /* We can not know the hardware head and tail when this ++ * function is called in reset flow, so we reuse all desc. 
++ */ ++ for (j = 0; j < rx_ring->desc_num; j++) ++ hns3_reuse_buffer(rx_ring, j); ++ ++ rx_ring->next_to_clean = 0; ++ rx_ring->next_to_use = 0; ++ } ++ ++ hns3_init_tx_ring_tc(priv); ++ ++ return 0; ++} ++ ++static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct net_device *ndev = kinfo->netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) ++ return 0; ++ ++ if (!netif_running(ndev)) ++ return 0; ++ ++ return hns3_nic_net_stop(ndev); ++} ++ ++static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); ++ int ret = 0; ++ ++ if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_err(kinfo->netdev, "device is not initialized yet\n"); ++ return -EFAULT; ++ } ++ ++ clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ ++ if (netif_running(kinfo->netdev)) { ++ ret = hns3_nic_net_open(kinfo->netdev); ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ netdev_err(kinfo->netdev, ++ "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ } ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int ret; ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) ++ goto err_put_ring; ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) ++ goto err_dealloc_vector; ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) ++ goto err_uninit_vector; ++ ++ hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode); ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto err_init_irq_fail; ++ } ++ ++ if (!hns3_is_phys_func(handle->pdev)) ++ hns3_init_mac_addr(netdev); ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! 
ret=%d\n", ret); ++ goto err_client_start_fail; ++ } ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ return ret; ++ ++err_client_start_fail: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++err_init_irq_fail: ++ hns3_uninit_all_ring(priv); ++err_uninit_vector: ++ hns3_nic_uninit_vector_data(priv); ++err_dealloc_vector: ++ hns3_nic_dealloc_vector_data(priv); ++err_put_ring: ++ hns3_put_ring_config(priv); ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ return 0; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++ hns3_clear_all_ring(handle, true); ++ hns3_reset_tx_queue(priv->ae_handle); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++ return 0; ++} ++ ++int hns3_reset_notify(struct hnae3_handle *handle, ++ enum hnae3_reset_notify_type type) ++{ ++ int ret = 0; ++ ++ switch (type) { ++ case HNAE3_UP_CLIENT: ++ ret = hns3_reset_notify_up_enet(handle); ++ break; ++ case HNAE3_DOWN_CLIENT: ++ ret = hns3_reset_notify_down_enet(handle); ++ break; ++ case HNAE3_INIT_CLIENT: ++ ret = hns3_reset_notify_init_enet(handle); ++ break; ++ case HNAE3_UNINIT_CLIENT: ++ ret = hns3_reset_notify_uninit_enet(handle); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int hns3_change_channels(struct hnae3_handle *handle, u32 new_tqp_num, ++ bool rxfh_configured) ++{ ++ int ret; ++ ++ ret = handle->ae_algo->ops->set_channels(handle, new_tqp_num, ++ rxfh_configured); ++ if (ret) { ++ dev_err(&handle->pdev->dev, ++ "Change tqp num(%u) fail.\n", new_tqp_num); ++ return ret; ++ } ++ ++ ret = hns3_reset_notify(handle, HNAE3_INIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(handle, HNAE3_UP_CLIENT); ++ if (ret) ++ hns3_reset_notify(handle, HNAE3_UNINIT_CLIENT); ++ ++ return ret; ++} ++ ++int hns3_set_channels(struct net_device *netdev, ++ struct ethtool_channels *ch) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ bool rxfh_configured = netif_is_rxfh_configured(netdev); ++ u32 new_tqp_num = ch->combined_count; ++ u16 org_tqp_num; ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (ch->rx_count || ch->tx_count) ++ return -EINVAL; ++ ++ if (kinfo->tc_info.mqprio_active) { ++ dev_err(&netdev->dev, ++ "it's not allowed to set channels via ethtool when MQPRIO mode is on\n"); ++ return -EINVAL; ++ } ++ ++ if (new_tqp_num > hns3_get_max_available_channels(h) || ++ new_tqp_num < 1) { ++ dev_err(&netdev->dev, ++ "Change tqps fail, the tqp range is from 1 to %u", ++ hns3_get_max_available_channels(h)); ++ return -EINVAL; ++ } ++ ++ if (kinfo->rss_size == new_tqp_num) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, ++ "set channels: tqp_num=%u, rxfh=%d\n", ++ new_tqp_num, rxfh_configured); ++ ++ ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ org_tqp_num = h->kinfo.num_tqps; ++ ret = hns3_change_channels(h, new_tqp_num, rxfh_configured); ++ if (ret) { ++ int ret1; ++ ++ netdev_warn(netdev, ++ "Change channels fail, revert to old value\n"); ++ ret1 = 
hns3_change_channels(h, org_tqp_num, rxfh_configured); ++ if (ret1) { ++ netdev_err(netdev, ++ "revert to old channel fail\n"); ++ return ret1; ++ } ++ ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct hns3_hw_error_info hns3_hw_err[] = { ++ { .type = HNAE3_PPU_POISON_ERROR, ++ .msg = "PPU poison" }, ++ { .type = HNAE3_CMDQ_ECC_ERROR, ++ .msg = "IMP CMDQ error" }, ++ { .type = HNAE3_IMP_RD_POISON_ERROR, ++ .msg = "IMP RD poison" }, ++ { .type = HNAE3_ROCEE_AXI_RESP_ERROR, ++ .msg = "ROCEE AXI RESP error" }, ++}; ++ ++static void hns3_process_hw_error(struct hnae3_handle *handle, ++ enum hnae3_hw_error_type type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(hns3_hw_err); i++) { ++ if (hns3_hw_err[i].type == type) { ++ dev_err(&handle->pdev->dev, "Detected %s!\n", ++ hns3_hw_err[i].msg); ++ break; ++ } ++ } ++} ++ ++static const struct hnae3_client_ops client_ops = { ++ .init_instance = hns3_client_init, ++ .uninit_instance = hns3_client_uninit, ++ .link_status_change = hns3_link_status_change, ++ .reset_notify = hns3_reset_notify, ++ .process_hw_error = hns3_process_hw_error, ++}; ++ ++/* hns3_init_module - Driver registration routine ++ * hns3_init_module is the first routine called when the driver is ++ * loaded. All it does is register with the PCI subsystem. ++ */ ++static int __init hns3_init_module(void) ++{ ++ int ret; ++ ++ pr_info("%s: %s - version\n", hns3_driver_name, hns3_driver_string); ++ pr_info("%s: %s\n", hns3_driver_name, hns3_copyright); ++ ++ client.type = HNAE3_CLIENT_KNIC; ++ snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH, "%s", ++ hns3_driver_name); ++ ++ client.ops = &client_ops; ++ ++ INIT_LIST_HEAD(&client.node); ++ ++ hns3_dbg_register_debugfs(hns3_driver_name); ++ ++ ret = hnae3_register_client(&client); ++ if (ret) ++ goto err_reg_client; ++ ++ ret = pci_register_driver(&hns3_driver); ++ if (ret) ++ goto err_reg_driver; ++ ++ return ret; ++ ++err_reg_driver: ++ hnae3_unregister_client(&client); ++err_reg_client: ++ hns3_dbg_unregister_debugfs(); ++ return ret; ++} ++module_init(hns3_init_module); ++ ++/* hns3_exit_module - Driver exit cleanup routine ++ * hns3_exit_module is called just before the driver is removed ++ * from memory. ++ */ ++static void __exit hns3_exit_module(void) ++{ ++ pci_unregister_driver(&hns3_driver); ++ hnae3_unregister_client(&client); ++ hns3_dbg_unregister_debugfs(); ++} ++module_exit(hns3_exit_module); ++ ++MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver"); ++MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("pci:hns-nic"); +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rx unsigned int start; do { @@ -1599,11 +7449,10 @@ index e5828a658caf4..a866bea651103 100644 } /** -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -index 3b6c7b5857376..5051cdff2384b 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_tx unsigned int start; do { @@ -1620,11 +7469,10 @@ index 3b6c7b5857376..5051cdff2384b 100644 } /** -diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -index 2cca9e84e31e1..34ab5ff9823b7 100644 ---- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +--- linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-04 10:40:26.684034126 -0500 +@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1637,7 +7485,7 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1650,11 +7498,10 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -index e9cd0fa6a0d2f..90f2eee78a3ee 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1663,7 +7510,7 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 * Assumes that queue stats are defined in i40e_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct i40e_ring *ring) +@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1683,11 +7530,10 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c -index e3d9804aeb25e..09a9f67d9ebc0 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_main.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c -@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c linux/drivers/net/ethernet/intel/i40e/i40e_main.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct unsigned int start; do { @@ -1700,7 +7546,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, +@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct if (!ring) continue; do { @@ -1713,7 +7559,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1726,7 +7572,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1739,7 +7585,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 rx_b += bytes; rx_p += packets; rx_buf += p->rx_stats.alloc_buff_failed; -@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1752,11 +7598,10 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -index e535d4c3da49d..fafa3406e0bcc 100644 ---- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1765,7 +7610,7 @@ index e535d4c3da49d..fafa3406e0bcc 100644 * Assumes that queue stats are defined in iavf_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) +@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1783,11 +7628,10 @@ index e535d4c3da49d..fafa3406e0bcc 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c -index e109cb93886be..b7394c7e5eed2 100644 ---- a/drivers/net/ethernet/intel/ice/ice_main.c -+++ b/drivers/net/ethernet/intel/ice/ice_main.c -@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_stats_sync *syncp, +diff -rupN linux.orig/drivers/net/ethernet/intel/ice/ice_main.c linux/drivers/net/ethernet/intel/ice/ice_main.c +--- linux.orig/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_ unsigned int start; do { @@ -1800,11 +7644,10 @@ index e109cb93886be..b7394c7e5eed2 100644 } /** -diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c -index c14fc871dd417..23c6fcfcb905c 100644 ---- a/drivers/net/ethernet/intel/igb/igb_ethtool.c -+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c -@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c linux/drivers/net/ethernet/intel/igb/igb_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct ring = adapter->tx_ring[j]; do { @@ -1824,7 +7667,7 @@ index c14fc871dd417..23c6fcfcb905c 100644 data[i+2] += restart2; i += IGB_TX_QUEUE_STATS_LEN; -@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1840,11 +7683,10 @@ index c14fc871dd417..23c6fcfcb905c 100644 i += IGB_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c -index 2796e81d27260..98df55dc1e933 100644 ---- a/drivers/net/ethernet/intel/igb/igb_main.c -+++ b/drivers/net/ethernet/intel/igb/igb_main.c -@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_main.c linux/drivers/net/ethernet/intel/igb/igb_main.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter } do { @@ -1857,7 +7699,7 @@ index 2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter *adapter) +@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter for (i = 0; i < adapter->num_tx_queues; i++) { struct igb_ring *ring = adapter->tx_ring[i]; do { @@ -1870,11 +7712,10 @@ index 
2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c -index 8cc077b712add..5a26a7805ef80 100644 ---- a/drivers/net/ethernet/intel/igc/igc_ethtool.c -+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c -@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c linux/drivers/net/ethernet/intel/igc/igc_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct ring = adapter->tx_ring[j]; do { @@ -1894,7 +7735,7 @@ index 8cc077b712add..5a26a7805ef80 100644 data[i + 2] += restart2; i += IGC_TX_QUEUE_STATS_LEN; -@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1910,11 +7751,10 @@ index 8cc077b712add..5a26a7805ef80 100644 i += IGC_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c -index ebff0e04045d6..944299b06cc3d 100644 ---- a/drivers/net/ethernet/intel/igc/igc_main.c -+++ b/drivers/net/ethernet/intel/igc/igc_main.c -@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_main.c linux/drivers/net/ethernet/intel/igc/igc_main.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter } do { @@ -1927,7 +7767,7 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter *adapter) +@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter struct igc_ring *ring = adapter->tx_ring[i]; do { @@ -1940,11 +7780,10 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -index 04f453eabef64..51bcf0df3adcc 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1957,7 +7796,7 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) { -@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1970,11 +7809,10 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -index d1e430b8c8aa1..01c5548f181d5 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struc if (ring) { do { @@ -1987,7 +7825,7 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net_device *netdev, +@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net if (ring) { do { @@ -2000,11 +7838,10 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -index fed46872af2bf..b4632b67ab143 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2017,7 +7854,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2030,7 +7867,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2043,11 +7880,10 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -index 2f12fbe229c15..1d31b8cff4f10 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(st if (ring) { do { @@ -2060,7 +7896,7 @@ index 2f12fbe229c15..1d31b8cff4f10 100644 stats->tx_bytes += bytes; stats->tx_packets += packets; } -@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net_device *netdev, +@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net for (i = 0; i < adapter->num_rx_queues; i++) { ring = adapter->rx_ring[i]; do { @@ -2073,11 +7909,10 @@ index 2f12fbe229c15..1d31b8cff4f10 
100644 stats->rx_bytes += bytes; stats->rx_packets += packets; } -diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c -index 0caa2df87c044..89ea3ef0ee162 100644 ---- a/drivers/net/ethernet/marvell/mvneta.c -+++ b/drivers/net/ethernet/marvell/mvneta.c -@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvneta.c linux/drivers/net/ethernet/marvell/mvneta.c +--- linux.orig/drivers/net/ethernet/marvell/mvneta.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvneta.c 2022-12-04 10:40:26.692034106 -0500 +@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *de cpu_stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2094,7 +7929,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2103,7 +7938,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 skb_alloc_error = stats->es.skb_alloc_error; refill_error = stats->es.refill_error; xdp_redirect = stats->es.ps.xdp_redirect; -@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct xdp_xmit_err = stats->es.ps.xdp_xmit_err; xdp_tx = stats->es.ps.xdp_tx; xdp_tx_err = stats->es.ps.xdp_tx_err; @@ -2112,11 +7947,10 @@ index 0caa2df87c044..89ea3ef0ee162 100644 es->skb_alloc_error += skb_alloc_error; es->refill_error += refill_error; -diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -index eaa51cd7456b6..9dd8e0315dd4f 100644 ---- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +--- linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2125,7 +7959,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_redirect = cpu_stats->xdp_redirect; xdp_pass = cpu_stats->xdp_pass; xdp_drop = cpu_stats->xdp_drop; -@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p xdp_xmit_err = cpu_stats->xdp_xmit_err; xdp_tx = cpu_stats->xdp_tx; xdp_tx_err = cpu_stats->xdp_tx_err; @@ -2134,7 +7968,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_stats->xdp_redirect += xdp_redirect; xdp_stats->xdp_pass += xdp_pass; -@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2149,11 +7983,10 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c -index bbea5458000bf..c9bb92187719c 100644 ---- a/drivers/net/ethernet/marvell/sky2.c -+++ 
b/drivers/net/ethernet/marvell/sky2.c -@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/sky2.c linux/drivers/net/ethernet/marvell/sky2.c +--- linux.orig/drivers/net/ethernet/marvell/sky2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/sky2.c 2022-12-04 10:40:26.692034106 -0500 +@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_de u64 _bytes, _packets; do { @@ -2177,11 +8010,10 @@ index bbea5458000bf..c9bb92187719c 100644 stats->tx_packets = _packets; stats->tx_bytes = _bytes; -diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -index b344632beaddf..988927f8c5d7d 100644 ---- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c -+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-04 10:40:26.692034106 -0500 +@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_d } do { @@ -2190,7 +8022,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->rx_packets = hw_stats->rx_packets; storage->tx_packets = hw_stats->tx_packets; storage->rx_bytes = hw_stats->rx_bytes; -@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_device *dev, +@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_d storage->rx_crc_errors = hw_stats->rx_fcs_errors; storage->rx_errors = hw_stats->rx_checksum_errors; storage->tx_aborted_errors = hw_stats->tx_skip; @@ -2199,7 +8031,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->tx_errors = dev->stats.tx_errors; storage->rx_dropped = dev->stats.rx_dropped; -@@ -3664,13 +3664,13 @@ static void mtk_get_ethtool_stats(struct net_device *dev, +@@ -3668,13 +3668,13 @@ static void mtk_get_ethtool_stats(struct do { data_dst = data; @@ -2215,11 +8047,4339 @@ index b344632beaddf..988927f8c5d7d 100644 } static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, -diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -index 30c7b0e157218..fa2753318cdf7 100644 ---- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 2022-12-04 10:40:18.136056029 -0500 +@@ -0,0 +1,4325 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * ++ * Copyright (C) 2009-2016 John Crispin ++ * Copyright (C) 2009-2016 Felix Fietkau ++ * Copyright (C) 2013-2016 Michael Lee ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mtk_eth_soc.h" ++#include "mtk_wed.h" ++ ++static int mtk_msg_level = -1; ++module_param_named(msg_level, mtk_msg_level, int, 0); ++MODULE_PARM_DESC(msg_level, "Message level (-1=defaults,0=none,...,16=all)"); ++ ++#define 
MTK_ETHTOOL_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, x) / sizeof(u64) } ++ ++#define MTK_ETHTOOL_XDP_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, xdp_stats.x) / \ ++ sizeof(u64) } ++ ++static const struct mtk_reg_map mtk_reg_map = { ++ .tx_irq_mask = 0x1a1c, ++ .tx_irq_status = 0x1a18, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x1800, ++ .rx_ptr = 0x1900, ++ .rx_cnt_cfg = 0x1904, ++ .qcrx_ptr = 0x1908, ++ .glo_cfg = 0x1a04, ++ .rst_idx = 0x1a08, ++ .delay_irq = 0x1a0c, ++ .fc_th = 0x1a10, ++ .int_grp = 0x1a20, ++ .hred = 0x1a44, ++ .ctx_ptr = 0x1b00, ++ .dtx_ptr = 0x1b04, ++ .crx_ptr = 0x1b10, ++ .drx_ptr = 0x1b14, ++ .fq_head = 0x1b20, ++ .fq_tail = 0x1b24, ++ .fq_count = 0x1b28, ++ .fq_blen = 0x1b2c, ++ }, ++ .gdm1_cnt = 0x2400, ++}; ++ ++static const struct mtk_reg_map mt7628_reg_map = { ++ .tx_irq_mask = 0x0a28, ++ .tx_irq_status = 0x0a20, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++}; ++ ++static const struct mtk_reg_map mt7986_reg_map = { ++ .tx_irq_mask = 0x461c, ++ .tx_irq_status = 0x4618, ++ .pdma = { ++ .rx_ptr = 0x6100, ++ .rx_cnt_cfg = 0x6104, ++ .pcrx_ptr = 0x6108, ++ .glo_cfg = 0x6204, ++ .rst_idx = 0x6208, ++ .delay_irq = 0x620c, ++ .irq_status = 0x6220, ++ .irq_mask = 0x6228, ++ .int_grp = 0x6250, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x4400, ++ .rx_ptr = 0x4500, ++ .rx_cnt_cfg = 0x4504, ++ .qcrx_ptr = 0x4508, ++ .glo_cfg = 0x4604, ++ .rst_idx = 0x4608, ++ .delay_irq = 0x460c, ++ .fc_th = 0x4610, ++ .int_grp = 0x4620, ++ .hred = 0x4644, ++ .ctx_ptr = 0x4700, ++ .dtx_ptr = 0x4704, ++ .crx_ptr = 0x4710, ++ .drx_ptr = 0x4714, ++ .fq_head = 0x4720, ++ .fq_tail = 0x4724, ++ .fq_count = 0x4728, ++ .fq_blen = 0x472c, ++ }, ++ .gdm1_cnt = 0x1c00, ++}; ++ ++/* strings used by ethtool */ ++static const struct mtk_ethtool_stats { ++ char str[ETH_GSTRING_LEN]; ++ u32 offset; ++} mtk_ethtool_stats[] = { ++ MTK_ETHTOOL_STAT(tx_bytes), ++ MTK_ETHTOOL_STAT(tx_packets), ++ MTK_ETHTOOL_STAT(tx_skip), ++ MTK_ETHTOOL_STAT(tx_collisions), ++ MTK_ETHTOOL_STAT(rx_bytes), ++ MTK_ETHTOOL_STAT(rx_packets), ++ MTK_ETHTOOL_STAT(rx_overflow), ++ MTK_ETHTOOL_STAT(rx_fcs_errors), ++ MTK_ETHTOOL_STAT(rx_short_errors), ++ MTK_ETHTOOL_STAT(rx_long_errors), ++ MTK_ETHTOOL_STAT(rx_checksum_errors), ++ MTK_ETHTOOL_STAT(rx_flow_control_packets), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_redirect), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_pass), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_drop), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx_errors), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit_errors), ++}; ++ ++static const char * const mtk_clks_source_name[] = { ++ "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "trgpll", ++ "sgmii_tx250m", "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", ++ "sgmii2_tx250m", "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", ++ "sgmii_ck", "eth2pll", "wocpu0", "wocpu1", "netsys0", "netsys1" ++}; ++ ++void mtk_w32(struct mtk_eth *eth, u32 val, unsigned reg) ++{ ++ __raw_writel(val, eth->base + reg); ++} ++ ++u32 mtk_r32(struct mtk_eth *eth, unsigned reg) ++{ ++ return __raw_readl(eth->base + reg); ++} ++ ++static u32 mtk_m32(struct mtk_eth *eth, u32 mask, 
u32 set, unsigned reg) ++{ ++ u32 val; ++ ++ val = mtk_r32(eth, reg); ++ val &= ~mask; ++ val |= set; ++ mtk_w32(eth, val, reg); ++ return reg; ++} ++ ++static int mtk_mdio_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned long t_start = jiffies; ++ ++ while (1) { ++ if (!(mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_ACCESS)) ++ return 0; ++ if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT)) ++ break; ++ cond_resched(); ++ } ++ ++ dev_err(eth->dev, "mdio: MDIO timeout\n"); ++ return -ETIMEDOUT; ++} ++ ++static int _mtk_mdio_write(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg, ++ u32 write_data) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static int _mtk_mdio_read(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_READ | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_C22_READ | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_DATA_MASK; ++} ++ ++static int mtk_mdio_write(struct mii_bus *bus, int phy_addr, ++ int phy_reg, u16 val) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_write(eth, phy_addr, phy_reg, val); ++} ++ ++static int mtk_mdio_read(struct mii_bus *bus, int phy_addr, int phy_reg) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_read(eth, phy_addr, phy_reg); ++} ++ ++static int mt7621_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface) ++{ ++ u32 val; ++ ++ /* Check DDR memory type. ++ * Currently TRGMII mode with DDR2 memory is not supported. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG, &val); ++ if (interface == PHY_INTERFACE_MODE_TRGMII && ++ val & SYSCFG_DRAM_TYPE_DDR2) { ++ dev_err(eth->dev, ++ "TRGMII mode with DDR2 memory is not supported!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ val = (interface == PHY_INTERFACE_MODE_TRGMII) ? 
++ ETHSYS_TRGMII_MT7621_DDR_PLL : 0; ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_MT7621_MASK, val); ++ ++ return 0; ++} ++ ++static void mtk_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface, int speed) ++{ ++ u32 val; ++ int ret; ++ ++ if (interface == PHY_INTERFACE_MODE_TRGMII) { ++ mtk_w32(eth, TRGMII_MODE, INTF_MODE); ++ val = 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ return; ++ } ++ ++ val = (speed == SPEED_1000) ? ++ INTF_MODE_RGMII_1000 : INTF_MODE_RGMII_10_100; ++ mtk_w32(eth, val, INTF_MODE); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_CLK_SEL362_5, ++ ETHSYS_TRGMII_CLK_SEL362_5); ++ ++ val = (speed == SPEED_1000) ? 250000000 : 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ ++ val = (speed == SPEED_1000) ? ++ RCK_CTRL_RGMII_1000 : RCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_RCK_CTRL); ++ ++ val = (speed == SPEED_1000) ? ++ TCK_CTRL_RGMII_1000 : TCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_TCK_CTRL); ++} ++ ++static struct phylink_pcs *mtk_mac_select_pcs(struct phylink_config *config, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ unsigned int sid; ++ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) { ++ sid = (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_SGMII)) ? ++ 0 : mac->id; ++ ++ return mtk_sgmii_select_pcs(eth->sgmii, sid); ++ } ++ ++ return NULL; ++} ++ ++static void mtk_mac_config(struct phylink_config *config, unsigned int mode, ++ const struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ int val, ge_mode, err = 0; ++ u32 i; ++ ++ /* MT76x8 has no hardware settings between for the MAC */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ mac->interface != state->interface) { ++ /* Setup soc pin functions */ ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_TRGMII: ++ if (mac->id) ++ goto err_phy; ++ if (!MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_GMAC1_TRGMII)) ++ goto err_phy; ++ fallthrough; ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_REVMII: ++ case PHY_INTERFACE_MODE_RMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RGMII)) { ++ err = mtk_gmac_rgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_1000BASEX: ++ case PHY_INTERFACE_MODE_2500BASEX: ++ case PHY_INTERFACE_MODE_SGMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ err = mtk_gmac_sgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_GMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_GEPHY)) { ++ err = mtk_gmac_gephy_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ default: ++ goto err_phy; ++ } ++ ++ /* Setup clock for 1st gmac */ ++ if (!mac->id && state->interface != PHY_INTERFACE_MODE_SGMII && ++ !phy_interface_mode_is_8023z(state->interface) && ++ MTK_HAS_CAPS(mac->hw->soc->caps, MTK_GMAC1_TRGMII)) { ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_TRGMII_MT7621_CLK)) { ++ if 
(mt7621_gmac0_rgmii_adjust(mac->hw, ++ state->interface)) ++ goto err_phy; ++ } else { ++ /* FIXME: this is incorrect. Not only does it ++ * use state->speed (which is not guaranteed ++ * to be correct) but it also makes use of it ++ * in a code path that will only be reachable ++ * when the PHY interface mode changes, not ++ * when the speed changes. Consequently, RGMII ++ * is probably broken. ++ */ ++ mtk_gmac0_rgmii_adjust(mac->hw, ++ state->interface, ++ state->speed); ++ ++ /* mt7623_pad_clk_setup */ ++ for (i = 0 ; i < NUM_TRGMII_CTRL; i++) ++ mtk_w32(mac->hw, ++ TD_DM_DRVP(8) | TD_DM_DRVN(8), ++ TRGMII_TD_ODT(i)); ++ ++ /* Assert/release MT7623 RXC reset */ ++ mtk_m32(mac->hw, 0, RXC_RST | RXC_DQSISEL, ++ TRGMII_RCK_CTRL); ++ mtk_m32(mac->hw, RXC_RST, 0, TRGMII_RCK_CTRL); ++ } ++ } ++ ++ ge_mode = 0; ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_GMII: ++ ge_mode = 1; ++ break; ++ case PHY_INTERFACE_MODE_REVMII: ++ ge_mode = 2; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ if (mac->id) ++ goto err_phy; ++ ge_mode = 3; ++ break; ++ default: ++ break; ++ } ++ ++ /* put the gmac into the right mode */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id); ++ val |= SYSCFG0_GE_MODE(ge_mode, mac->id); ++ regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val); ++ ++ mac->interface = state->interface; ++ } ++ ++ /* SGMII */ ++ if (state->interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(state->interface)) { ++ /* The path GMAC to SGMII will be enabled once the SGMIISYS is ++ * being setup done. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, ++ ~(u32)SYSCFG0_SGMII_MASK); ++ ++ /* Save the syscfg0 value for mac_finish */ ++ mac->syscfg0 = val; ++ } else if (phylink_autoneg_inband(mode)) { ++ dev_err(eth->dev, ++ "In-band mode not supported in non SGMII mode!\n"); ++ return; ++ } ++ ++ return; ++ ++err_phy: ++ dev_err(eth->dev, "%s: GMAC%d mode %s not supported!\n", __func__, ++ mac->id, phy_modes(state->interface)); ++ return; ++ ++init_err: ++ dev_err(eth->dev, "%s: GMAC%d mode %s err: %d!\n", __func__, ++ mac->id, phy_modes(state->interface), err); ++} ++ ++static int mtk_mac_finish(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ /* Enable SGMII */ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, mac->syscfg0); ++ ++ /* Setup gmac */ ++ mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur; ++ mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE | ++ MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK; ++ ++ /* Only update control register when needed! 
*/ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ ++ return 0; ++} ++ ++static void mtk_mac_pcs_get_state(struct phylink_config *config, ++ struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 pmsr = mtk_r32(mac->hw, MTK_MAC_MSR(mac->id)); ++ ++ state->link = (pmsr & MAC_MSR_LINK); ++ state->duplex = (pmsr & MAC_MSR_DPX) >> 1; ++ ++ switch (pmsr & (MAC_MSR_SPEED_1000 | MAC_MSR_SPEED_100)) { ++ case 0: ++ state->speed = SPEED_10; ++ break; ++ case MAC_MSR_SPEED_100: ++ state->speed = SPEED_100; ++ break; ++ case MAC_MSR_SPEED_1000: ++ state->speed = SPEED_1000; ++ break; ++ default: ++ state->speed = SPEED_UNKNOWN; ++ break; ++ } ++ ++ state->pause &= (MLO_PAUSE_RX | MLO_PAUSE_TX); ++ if (pmsr & MAC_MSR_RX_FC) ++ state->pause |= MLO_PAUSE_RX; ++ if (pmsr & MAC_MSR_TX_FC) ++ state->pause |= MLO_PAUSE_TX; ++} ++ ++static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ ++ mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN); ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static void mtk_mac_link_up(struct phylink_config *config, ++ struct phy_device *phy, ++ unsigned int mode, phy_interface_t interface, ++ int speed, int duplex, bool tx_pause, bool rx_pause) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr; ++ ++ mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr &= ~(MAC_MCR_SPEED_100 | MAC_MCR_SPEED_1000 | ++ MAC_MCR_FORCE_DPX | MAC_MCR_FORCE_TX_FC | ++ MAC_MCR_FORCE_RX_FC); ++ ++ /* Configure speed */ ++ switch (speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ mcr |= MAC_MCR_SPEED_1000; ++ break; ++ case SPEED_100: ++ mcr |= MAC_MCR_SPEED_100; ++ break; ++ } ++ ++ /* Configure duplex */ ++ if (duplex == DUPLEX_FULL) ++ mcr |= MAC_MCR_FORCE_DPX; ++ ++ /* Configure pause modes - phylink will avoid these for half duplex */ ++ if (tx_pause) ++ mcr |= MAC_MCR_FORCE_TX_FC; ++ if (rx_pause) ++ mcr |= MAC_MCR_FORCE_RX_FC; ++ ++ mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN; ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static const struct phylink_mac_ops mtk_phylink_ops = { ++ .validate = phylink_generic_validate, ++ .mac_select_pcs = mtk_mac_select_pcs, ++ .mac_pcs_get_state = mtk_mac_pcs_get_state, ++ .mac_config = mtk_mac_config, ++ .mac_finish = mtk_mac_finish, ++ .mac_link_down = mtk_mac_link_down, ++ .mac_link_up = mtk_mac_link_up, ++}; ++ ++static int mtk_mdio_init(struct mtk_eth *eth) ++{ ++ struct device_node *mii_np; ++ int ret; ++ ++ mii_np = of_get_child_by_name(eth->dev->of_node, "mdio-bus"); ++ if (!mii_np) { ++ dev_err(eth->dev, "no %s child node found", "mdio-bus"); ++ return -ENODEV; ++ } ++ ++ if (!of_device_is_available(mii_np)) { ++ ret = -ENODEV; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus = devm_mdiobus_alloc(eth->dev); ++ if (!eth->mii_bus) { ++ ret = -ENOMEM; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus->name = "mdio"; ++ eth->mii_bus->read = mtk_mdio_read; ++ eth->mii_bus->write = mtk_mdio_write; ++ eth->mii_bus->probe_capabilities = MDIOBUS_C22_C45; ++ eth->mii_bus->priv = eth; ++ eth->mii_bus->parent = eth->dev; ++ ++ snprintf(eth->mii_bus->id, MII_BUS_ID_SIZE, "%pOFn", mii_np); ++ ret = of_mdiobus_register(eth->mii_bus, mii_np); ++ ++err_put_node: ++ of_node_put(mii_np); ++ return ret; ++} ++ ++static void 
mtk_mdio_cleanup(struct mtk_eth *eth) ++{ ++ if (!eth->mii_bus) ++ return; ++ ++ mdiobus_unregister(eth->mii_bus); ++} ++ ++static inline void mtk_tx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_tx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static int mtk_set_mac_address(struct net_device *dev, void *p) ++{ ++ int ret = eth_mac_addr(dev, p); ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ const char *macaddr = dev->dev_addr; ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ spin_lock_bh(&mac->hw->page_lock); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MT7628_SDM_MAC_ADRH); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MT7628_SDM_MAC_ADRL); ++ } else { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MTK_GDMA_MAC_ADRH(mac->id)); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MTK_GDMA_MAC_ADRL(mac->id)); ++ } ++ spin_unlock_bh(&mac->hw->page_lock); ++ ++ return 0; ++} ++ ++void mtk_stats_update_mac(struct mtk_mac *mac) ++{ ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ hw_stats->tx_packets += mtk_r32(mac->hw, MT7628_SDM_TPCNT); ++ hw_stats->tx_bytes += mtk_r32(mac->hw, MT7628_SDM_TBCNT); ++ hw_stats->rx_packets += mtk_r32(mac->hw, MT7628_SDM_RPCNT); ++ hw_stats->rx_bytes += mtk_r32(mac->hw, MT7628_SDM_RBCNT); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, MT7628_SDM_CS_ERR); ++ } else { ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ unsigned int offs = hw_stats->reg_offset; ++ u64 stats; ++ ++ hw_stats->rx_bytes += mtk_r32(mac->hw, reg_map->gdm1_cnt + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x4 + offs); ++ if (stats) ++ hw_stats->rx_bytes += (stats << 32); ++ hw_stats->rx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x8 + offs); ++ hw_stats->rx_overflow += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x10 + offs); ++ hw_stats->rx_fcs_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x14 + offs); ++ hw_stats->rx_short_errors += ++ 
mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x18 + offs); ++ hw_stats->rx_long_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x1c + offs); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x20 + offs); ++ hw_stats->rx_flow_control_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x24 + offs); ++ hw_stats->tx_skip += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x28 + offs); ++ hw_stats->tx_collisions += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x2c + offs); ++ hw_stats->tx_bytes += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x30 + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x34 + offs); ++ if (stats) ++ hw_stats->tx_bytes += (stats << 32); ++ hw_stats->tx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x38 + offs); ++ } ++ ++ u64_stats_update_end(&hw_stats->syncp); ++} ++ ++static void mtk_stats_update(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->mac[i] || !eth->mac[i]->hw_stats) ++ continue; ++ if (spin_trylock(ð->mac[i]->hw_stats->stats_lock)) { ++ mtk_stats_update_mac(eth->mac[i]); ++ spin_unlock(ð->mac[i]->hw_stats->stats_lock); ++ } ++ } ++} ++ ++static void mtk_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ unsigned int start; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hw_stats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hw_stats->stats_lock); ++ } ++ } ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&hw_stats->syncp); ++ storage->rx_packets = hw_stats->rx_packets; ++ storage->tx_packets = hw_stats->tx_packets; ++ storage->rx_bytes = hw_stats->rx_bytes; ++ storage->tx_bytes = hw_stats->tx_bytes; ++ storage->collisions = hw_stats->tx_collisions; ++ storage->rx_length_errors = hw_stats->rx_short_errors + ++ hw_stats->rx_long_errors; ++ storage->rx_over_errors = hw_stats->rx_overflow; ++ storage->rx_crc_errors = hw_stats->rx_fcs_errors; ++ storage->rx_errors = hw_stats->rx_checksum_errors; ++ storage->tx_aborted_errors = hw_stats->tx_skip; ++ } while (u64_stats_fetch_retry_irq(&hw_stats->syncp, start)); ++ ++ storage->tx_errors = dev->stats.tx_errors; ++ storage->rx_dropped = dev->stats.rx_dropped; ++ storage->tx_dropped = dev->stats.tx_dropped; ++} ++ ++static inline int mtk_max_frag_size(int mtu) ++{ ++ /* make sure buf_size will be at least MTK_MAX_RX_LENGTH */ ++ if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH_2K) ++ mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return SKB_DATA_ALIGN(MTK_RX_HLEN + mtu) + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++} ++ ++static inline int mtk_max_buf_size(int frag_size) ++{ ++ int buf_size = frag_size - NET_SKB_PAD - NET_IP_ALIGN - ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ WARN_ON(buf_size < MTK_MAX_RX_LENGTH_2K); ++ ++ return buf_size; ++} ++ ++static bool mtk_rx_get_desc(struct mtk_eth *eth, struct mtk_rx_dma_v2 *rxd, ++ struct mtk_rx_dma_v2 *dma_rxd) ++{ ++ rxd->rxd2 = READ_ONCE(dma_rxd->rxd2); ++ if (!(rxd->rxd2 & RX_DMA_DONE)) ++ return false; ++ ++ rxd->rxd1 = READ_ONCE(dma_rxd->rxd1); ++ rxd->rxd3 = READ_ONCE(dma_rxd->rxd3); ++ rxd->rxd4 = READ_ONCE(dma_rxd->rxd4); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = READ_ONCE(dma_rxd->rxd5); ++ rxd->rxd6 = READ_ONCE(dma_rxd->rxd6); ++ } ++ ++ return true; ++} ++ ++static void *mtk_max_lro_buf_alloc(gfp_t gfp_mask) ++{ ++ unsigned int size = mtk_max_frag_size(MTK_MAX_LRO_RX_LENGTH); ++ 
unsigned long data; ++ ++ data = __get_free_pages(gfp_mask | __GFP_COMP | __GFP_NOWARN, ++ get_order(size)); ++ ++ return (void *)data; ++} ++ ++/* the qdma core needs scratch memory to be setup */ ++static int mtk_init_fq_dma(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ dma_addr_t phy_ring_tail; ++ int cnt = MTK_DMA_SIZE; ++ dma_addr_t dma_addr; ++ int i; ++ ++ eth->scratch_ring = dma_alloc_coherent(eth->dma_dev, ++ cnt * soc->txrx.txd_size, ++ ð->phy_scratch_ring, ++ GFP_KERNEL); ++ if (unlikely(!eth->scratch_ring)) ++ return -ENOMEM; ++ ++ eth->scratch_head = kcalloc(cnt, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); ++ if (unlikely(!eth->scratch_head)) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) ++ return -ENOMEM; ++ ++ phy_ring_tail = eth->phy_scratch_ring + soc->txrx.txd_size * (cnt - 1); ++ ++ for (i = 0; i < cnt; i++) { ++ struct mtk_tx_dma_v2 *txd; ++ ++ txd = eth->scratch_ring + i * soc->txrx.txd_size; ++ txd->txd1 = dma_addr + i * MTK_QDMA_PAGE_SIZE; ++ if (i < cnt - 1) ++ txd->txd2 = eth->phy_scratch_ring + ++ (i + 1) * soc->txrx.txd_size; ++ ++ txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ mtk_w32(eth, eth->phy_scratch_ring, soc->reg_map->qdma.fq_head); ++ mtk_w32(eth, phy_ring_tail, soc->reg_map->qdma.fq_tail); ++ mtk_w32(eth, (cnt << 16) | cnt, soc->reg_map->qdma.fq_count); ++ mtk_w32(eth, MTK_QDMA_PAGE_SIZE << 16, soc->reg_map->qdma.fq_blen); ++ ++ return 0; ++} ++ ++static void *mtk_qdma_phys_to_virt(struct mtk_tx_ring *ring, u32 desc) ++{ ++ return ring->dma + (desc - ring->phys); ++} ++ ++static struct mtk_tx_buf *mtk_desc_to_tx_buf(struct mtk_tx_ring *ring, ++ void *txd, u32 txd_size) ++{ ++ int idx = (txd - ring->dma) / txd_size; ++ ++ return &ring->buf[idx]; ++} ++ ++static struct mtk_tx_dma *qdma_to_pdma(struct mtk_tx_ring *ring, ++ struct mtk_tx_dma *dma) ++{ ++ return ring->dma_pdma - (struct mtk_tx_dma *)ring->dma + dma; ++} ++ ++static int txd_to_idx(struct mtk_tx_ring *ring, void *dma, u32 txd_size) ++{ ++ return (dma - ring->dma) / txd_size; ++} ++ ++static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct xdp_frame_bulk *bq, bool napi) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) { ++ dma_unmap_single(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ } else { ++ if (dma_unmap_len(tx_buf, dma_len0)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ ++ if (dma_unmap_len(tx_buf, dma_len1)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr1), ++ dma_unmap_len(tx_buf, dma_len1), ++ DMA_TO_DEVICE); ++ } ++ } ++ ++ if (tx_buf->data && tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ if (napi) ++ napi_consume_skb(skb, napi); ++ else ++ dev_kfree_skb_any(skb); ++ } else { ++ struct xdp_frame *xdpf = tx_buf->data; ++ ++ if (napi && tx_buf->type == 
MTK_TYPE_XDP_TX) ++ xdp_return_frame_rx_napi(xdpf); ++ else if (bq) ++ xdp_return_frame_bulk(xdpf, bq); ++ else ++ xdp_return_frame(xdpf); ++ } ++ } ++ tx_buf->flags = 0; ++ tx_buf->data = NULL; ++} ++ ++static void setup_tx_buf(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct mtk_tx_dma *txd, dma_addr_t mapped_addr, ++ size_t size, int idx) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } else { ++ if (idx & 1) { ++ txd->txd3 = mapped_addr; ++ txd->txd2 |= TX_DMA_PLEN1(size); ++ dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len1, size); ++ } else { ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ txd->txd1 = mapped_addr; ++ txd->txd2 = TX_DMA_PLEN0(size); ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } ++ } ++} ++ ++static void mtk_tx_set_dma_desc_v1(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_dma *desc = txd; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_SWC | TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT; /* forward port */ ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM; ++ /* vlan header offload */ ++ if (info->vlan) ++ data |= TX_DMA_INS_VLAN | info->vlan_tci; ++ } ++ WRITE_ONCE(desc->txd4, data); ++} ++ ++static void mtk_tx_set_dma_desc_v2(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma_v2 *desc = txd; ++ struct mtk_eth *eth = mac->hw; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ if (!info->qid && mac->id) ++ info->qid = MTK_QDMA_GMAC2_QID; ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT_V2; /* forward port */ ++ data |= TX_DMA_SWC_V2 | QID_BITS_V2(info->qid); ++ WRITE_ONCE(desc->txd4, data); ++ ++ data = 0; ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO_V2; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM_V2; ++ } ++ WRITE_ONCE(desc->txd5, data); ++ ++ data = 0; ++ if (info->first && info->vlan) ++ data |= TX_DMA_INS_VLAN_V2 | info->vlan_tci; ++ WRITE_ONCE(desc->txd6, data); ++ ++ WRITE_ONCE(desc->txd7, 0); ++ WRITE_ONCE(desc->txd8, 0); ++} ++ ++static void mtk_tx_set_dma_desc(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mtk_tx_set_dma_desc_v2(dev, txd, info); ++ else ++ mtk_tx_set_dma_desc_v1(dev, txd, info); ++} ++ ++static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev, ++ int tx_num, struct mtk_tx_ring *ring, bool gso) ++{ ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = skb_headlen(skb), ++ .gso = gso, ++ .csum = skb->ip_summed == CHECKSUM_PARTIAL, ++ .vlan = skb_vlan_tag_present(skb), ++ .qid = skb->mark & MTK_QDMA_TX_MASK, ++ .vlan_tci = skb_vlan_tag_get(skb), ++ .first = true, ++ .last = !skb_is_nonlinear(skb), ++ }; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth 
= mac->hw; ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_dma *itxd, *txd; ++ struct mtk_tx_dma *itxd_pdma, *txd_pdma; ++ struct mtk_tx_buf *itx_buf, *tx_buf; ++ int i, n_desc = 1; ++ int k = 0; ++ ++ itxd = ring->next_free; ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ if (itxd == ring->last_free) ++ return -ENOMEM; ++ ++ itx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ memset(itx_buf, 0, sizeof(*itx_buf)); ++ ++ txd_info.addr = dma_map_single(eth->dma_dev, skb->data, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ return -ENOMEM; ++ ++ mtk_tx_set_dma_desc(dev, itxd, &txd_info); ++ ++ itx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ setup_tx_buf(eth, itx_buf, itxd_pdma, txd_info.addr, txd_info.size, ++ k++); ++ ++ /* TX SG offload */ ++ txd = itxd; ++ txd_pdma = qdma_to_pdma(ring, txd); ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ unsigned int offset = 0; ++ int frag_size = skb_frag_size(frag); ++ ++ while (frag_size) { ++ bool new_desc = true; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || ++ (i & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto err_dma; ++ ++ n_desc++; ++ } else { ++ new_desc = false; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = min_t(unsigned int, frag_size, ++ soc->txrx.dma_max_len); ++ txd_info.qid = skb->mark & MTK_QDMA_TX_MASK; ++ txd_info.last = i == skb_shinfo(skb)->nr_frags - 1 && ++ !(frag_size - txd_info.size); ++ txd_info.addr = skb_frag_dma_map(eth->dma_dev, frag, ++ offset, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ goto err_dma; ++ ++ mtk_tx_set_dma_desc(dev, txd, &txd_info); ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ if (new_desc) ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ tx_buf->flags |= MTK_TX_FLAGS_PAGE0; ++ tx_buf->flags |= (!mac->id) ? 
MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info.addr, ++ txd_info.size, k++); ++ ++ frag_size -= txd_info.size; ++ offset += txd_info.size; ++ } ++ } ++ ++ /* store skb to cleanup */ ++ itx_buf->type = MTK_TYPE_SKB; ++ itx_buf->data = skb; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (k & 0x1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ netdev_sent_queue(dev, skb->len); ++ skb_tx_timestamp(skb); ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || ++ !netdev_xmit_more()) ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int next_idx; ++ ++ next_idx = NEXT_DESP_IDX(txd_to_idx(ring, txd, soc->txrx.txd_size), ++ ring->dma_size); ++ mtk_w32(eth, next_idx, MT7628_TX_CTX_IDX0); ++ } ++ ++ return 0; ++ ++err_dma: ++ do { ++ tx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ ++ /* unmap dma */ ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ itxd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2); ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ } while (itxd != txd); ++ ++ return -ENOMEM; ++} ++ ++static int mtk_cal_txd_req(struct mtk_eth *eth, struct sk_buff *skb) ++{ ++ int i, nfrags = 1; ++ skb_frag_t *frag; ++ ++ if (skb_is_gso(skb)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ frag = &skb_shinfo(skb)->frags[i]; ++ nfrags += DIV_ROUND_UP(skb_frag_size(frag), ++ eth->soc->txrx.dma_max_len); ++ } ++ } else { ++ nfrags += skb_shinfo(skb)->nr_frags; ++ } ++ ++ return nfrags; ++} ++ ++static int mtk_queue_stopped(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ if (netif_queue_stopped(eth->netdev[i])) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static void mtk_wake_queue(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ netif_wake_queue(eth->netdev[i]); ++ } ++} ++ ++static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct net_device_stats *stats = &dev->stats; ++ bool gso = false; ++ int tx_num; ++ ++ /* normally we can rely on the stack not calling this more than once, ++ * however we have 2 queues running on the same ring so we need to lock ++ * the ring access ++ */ ++ spin_lock(ð->page_lock); ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto drop; ++ ++ tx_num = mtk_cal_txd_req(eth, skb); ++ if (unlikely(atomic_read(&ring->free_count) <= tx_num)) { ++ netif_stop_queue(dev); ++ netif_err(eth, tx_queued, dev, ++ "Tx Ring full when queue awake!\n"); ++ spin_unlock(ð->page_lock); ++ return NETDEV_TX_BUSY; ++ } ++ ++ /* TSO: fill MSS info in tcp checksum field */ ++ if (skb_is_gso(skb)) { ++ if (skb_cow_head(skb, 0)) { ++ netif_warn(eth, tx_err, dev, ++ "GSO expand head fail.\n"); ++ goto drop; ++ } ++ ++ if (skb_shinfo(skb)->gso_type & ++ (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { ++ gso = true; ++ tcp_hdr(skb)->check = htons(skb_shinfo(skb)->gso_size); ++ } ++ } ++ ++ if (mtk_tx_map(skb, 
dev, tx_num, ring, gso) < 0) ++ goto drop; ++ ++ if (unlikely(atomic_read(&ring->free_count) <= ring->thresh)) ++ netif_stop_queue(dev); ++ ++ spin_unlock(ð->page_lock); ++ ++ return NETDEV_TX_OK; ++ ++drop: ++ spin_unlock(ð->page_lock); ++ stats->tx_dropped++; ++ dev_kfree_skb_any(skb); ++ return NETDEV_TX_OK; ++} ++ ++static struct mtk_rx_ring *mtk_get_rx_ring(struct mtk_eth *eth) ++{ ++ int i; ++ struct mtk_rx_ring *ring; ++ int idx; ++ ++ if (!eth->hwlro) ++ return ð->rx_ring[0]; ++ ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ ring = ð->rx_ring[i]; ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ if (rxd->rxd2 & RX_DMA_DONE) { ++ ring->calc_idx_update = true; ++ return ring; ++ } ++ } ++ ++ return NULL; ++} ++ ++static void mtk_update_rx_cpu_idx(struct mtk_eth *eth) ++{ ++ struct mtk_rx_ring *ring; ++ int i; ++ ++ if (!eth->hwlro) { ++ ring = ð->rx_ring[0]; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } else { ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ ring = ð->rx_ring[i]; ++ if (ring->calc_idx_update) { ++ ring->calc_idx_update = false; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } ++ } ++ } ++} ++ ++static bool mtk_page_pool_enabled(struct mtk_eth *eth) ++{ ++ return MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2); ++} ++ ++static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, ++ struct xdp_rxq_info *xdp_q, ++ int id, int size) ++{ ++ struct page_pool_params pp_params = { ++ .order = 0, ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, ++ .pool_size = size, ++ .nid = NUMA_NO_NODE, ++ .dev = eth->dma_dev, ++ .offset = MTK_PP_HEADROOM, ++ .max_len = MTK_PP_MAX_BUF_SIZE, ++ }; ++ struct page_pool *pp; ++ int err; ++ ++ pp_params.dma_dir = rcu_access_pointer(eth->prog) ? 
DMA_BIDIRECTIONAL ++ : DMA_FROM_DEVICE; ++ pp = page_pool_create(&pp_params); ++ if (IS_ERR(pp)) ++ return pp; ++ ++ err = __xdp_rxq_info_reg(xdp_q, ð->dummy_dev, eth->rx_napi.napi_id, ++ id, PAGE_SIZE); ++ if (err < 0) ++ goto err_free_pp; ++ ++ err = xdp_rxq_info_reg_mem_model(xdp_q, MEM_TYPE_PAGE_POOL, pp); ++ if (err) ++ goto err_unregister_rxq; ++ ++ return pp; ++ ++err_unregister_rxq: ++ xdp_rxq_info_unreg(xdp_q); ++err_free_pp: ++ page_pool_destroy(pp); ++ ++ return ERR_PTR(err); ++} ++ ++static void *mtk_page_pool_get_buff(struct page_pool *pp, dma_addr_t *dma_addr, ++ gfp_t gfp_mask) ++{ ++ struct page *page; ++ ++ page = page_pool_alloc_pages(pp, gfp_mask | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ *dma_addr = page_pool_get_dma_addr(page) + MTK_PP_HEADROOM; ++ return page_address(page); ++} ++ ++static void mtk_rx_put_buff(struct mtk_rx_ring *ring, void *data, bool napi) ++{ ++ if (ring->page_pool) ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(data), napi); ++ else ++ skb_free_frag(data); ++} ++ ++static int mtk_xdp_frame_map(struct mtk_eth *eth, struct net_device *dev, ++ struct mtk_tx_dma_desc_info *txd_info, ++ struct mtk_tx_dma *txd, struct mtk_tx_buf *tx_buf, ++ void *data, u16 headroom, int index, bool dma_map) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma *txd_pdma; ++ ++ if (dma_map) { /* ndo_xdp_xmit */ ++ txd_info->addr = dma_map_single(eth->dma_dev, data, ++ txd_info->size, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info->addr))) ++ return -ENOMEM; ++ ++ tx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ } else { ++ struct page *page = virt_to_head_page(data); ++ ++ txd_info->addr = page_pool_get_dma_addr(page) + ++ sizeof(struct xdp_frame) + headroom; ++ dma_sync_single_for_device(eth->dma_dev, txd_info->addr, ++ txd_info->size, DMA_BIDIRECTIONAL); ++ } ++ mtk_tx_set_dma_desc(dev, txd, txd_info); ++ ++ tx_buf->flags |= !mac->id ? MTK_TX_FLAGS_FPORT0 : MTK_TX_FLAGS_FPORT1; ++ tx_buf->type = dma_map ? MTK_TYPE_XDP_NDO : MTK_TYPE_XDP_TX; ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ ++ txd_pdma = qdma_to_pdma(ring, txd); ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info->addr, txd_info->size, ++ index); ++ ++ return 0; ++} ++ ++static int mtk_xdp_submit_frame(struct mtk_eth *eth, struct xdp_frame *xdpf, ++ struct net_device *dev, bool dma_map) ++{ ++ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = xdpf->len, ++ .first = true, ++ .last = !xdp_frame_has_frags(xdpf), ++ }; ++ int err, index = 0, n_desc = 1, nr_frags; ++ struct mtk_tx_dma *htxd, *txd, *txd_pdma; ++ struct mtk_tx_buf *htx_buf, *tx_buf; ++ void *data = xdpf->data; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ return -EBUSY; ++ ++ nr_frags = unlikely(xdp_frame_has_frags(xdpf)) ? 
sinfo->nr_frags : 0; ++ if (unlikely(atomic_read(&ring->free_count) <= 1 + nr_frags)) ++ return -EBUSY; ++ ++ spin_lock(ð->page_lock); ++ ++ txd = ring->next_free; ++ if (txd == ring->last_free) { ++ spin_unlock(ð->page_lock); ++ return -ENOMEM; ++ } ++ htxd = txd; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ htx_buf = tx_buf; ++ ++ for (;;) { ++ err = mtk_xdp_frame_map(eth, dev, &txd_info, txd, tx_buf, ++ data, xdpf->headroom, index, dma_map); ++ if (err < 0) ++ goto unmap; ++ ++ if (txd_info.last) ++ break; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || (index & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto unmap; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ n_desc++; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = skb_frag_size(&sinfo->frags[index]); ++ txd_info.last = index + 1 == nr_frags; ++ data = skb_frag_address(&sinfo->frags[index]); ++ ++ index++; ++ } ++ /* store xdpf for cleanup */ ++ htx_buf->data = xdpf; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (index & 1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int idx; ++ ++ idx = txd_to_idx(ring, txd, soc->txrx.txd_size); ++ mtk_w32(eth, NEXT_DESP_IDX(idx, ring->dma_size), ++ MT7628_TX_CTX_IDX0); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return 0; ++ ++unmap: ++ while (htxd != txd) { ++ txd_pdma = qdma_to_pdma(ring, htxd); ++ tx_buf = mtk_desc_to_tx_buf(ring, htxd, soc->txrx.txd_size); ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ htxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ txd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ htxd = mtk_qdma_phys_to_virt(ring, htxd->txd2); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return err; ++} ++ ++static int mtk_xdp_xmit(struct net_device *dev, int num_frame, ++ struct xdp_frame **frames, u32 flags) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ int i, nxmit = 0; ++ ++ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) ++ return -EINVAL; ++ ++ for (i = 0; i < num_frame; i++) { ++ if (mtk_xdp_submit_frame(eth, frames[i], dev, true)) ++ break; ++ nxmit++; ++ } ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ hw_stats->xdp_stats.tx_xdp_xmit += nxmit; ++ hw_stats->xdp_stats.tx_xdp_xmit_errors += num_frame - nxmit; ++ u64_stats_update_end(&hw_stats->syncp); ++ ++ return nxmit; ++} ++ ++static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring, ++ struct xdp_buff *xdp, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ u64 *count = &hw_stats->xdp_stats.rx_xdp_drop; ++ struct bpf_prog *prog; ++ u32 act = XDP_PASS; ++ ++ rcu_read_lock(); ++ ++ prog = rcu_dereference(eth->prog); ++ if (!prog) ++ goto out; ++ ++ act = bpf_prog_run_xdp(prog, xdp); ++ switch (act) { ++ case XDP_PASS: ++ count = &hw_stats->xdp_stats.rx_xdp_pass; ++ goto 
update_stats; ++ case XDP_REDIRECT: ++ if (unlikely(xdp_do_redirect(dev, xdp, prog))) { ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_redirect; ++ goto update_stats; ++ case XDP_TX: { ++ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); ++ ++ if (!xdpf || mtk_xdp_submit_frame(eth, xdpf, dev, false)) { ++ count = &hw_stats->xdp_stats.rx_xdp_tx_errors; ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_tx; ++ goto update_stats; ++ } ++ default: ++ bpf_warn_invalid_xdp_action(dev, prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(dev, prog, act); ++ fallthrough; ++ case XDP_DROP: ++ break; ++ } ++ ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(xdp->data), true); ++ ++update_stats: ++ u64_stats_update_begin(&hw_stats->syncp); ++ *count = *count + 1; ++ u64_stats_update_end(&hw_stats->syncp); ++out: ++ rcu_read_unlock(); ++ ++ return act; ++} ++ ++static int mtk_poll_rx(struct napi_struct *napi, int budget, ++ struct mtk_eth *eth) ++{ ++ struct dim_sample dim_sample = {}; ++ struct mtk_rx_ring *ring; ++ bool xdp_flush = false; ++ int idx; ++ struct sk_buff *skb; ++ u8 *data, *new_data; ++ struct mtk_rx_dma_v2 *rxd, trxd; ++ int done = 0, bytes = 0; ++ ++ while (done < budget) { ++ unsigned int pktlen, *rxdcsum; ++ struct net_device *netdev; ++ dma_addr_t dma_addr; ++ u32 hash, reason; ++ int mac = 0; ++ ++ ring = mtk_get_rx_ring(eth); ++ if (unlikely(!ring)) ++ goto rx_done; ++ ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ data = ring->data[idx]; ++ ++ if (!mtk_rx_get_desc(eth, &trxd, rxd)) ++ break; ++ ++ /* find out which mac the packet come from. values start at 1 */ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mac = RX_DMA_GET_SPORT_V2(trxd.rxd5) - 1; ++ else if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ !(trxd.rxd4 & RX_DMA_SPECIAL_TAG)) ++ mac = RX_DMA_GET_SPORT(trxd.rxd4) - 1; ++ ++ if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT || ++ !eth->netdev[mac])) ++ goto release_desc; ++ ++ netdev = eth->netdev[mac]; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto release_desc; ++ ++ pktlen = RX_DMA_GET_PLEN0(trxd.rxd2); ++ ++ /* alloc new buffer */ ++ if (ring->page_pool) { ++ struct page *page = virt_to_head_page(data); ++ struct xdp_buff xdp; ++ u32 ret; ++ ++ new_data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, ++ GFP_ATOMIC); ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_sync_single_for_cpu(eth->dma_dev, ++ page_pool_get_dma_addr(page) + MTK_PP_HEADROOM, ++ pktlen, page_pool_get_dma_dir(ring->page_pool)); ++ ++ xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_q); ++ xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen, ++ false); ++ xdp_buff_clear_frags_flag(&xdp); ++ ++ ret = mtk_xdp_run(eth, ring, &xdp, netdev); ++ if (ret == XDP_REDIRECT) ++ xdp_flush = true; ++ ++ if (ret != XDP_PASS) ++ goto skip_rx; ++ ++ skb = build_skb(data, PAGE_SIZE); ++ if (unlikely(!skb)) { ++ page_pool_put_full_page(ring->page_pool, ++ page, true); ++ netdev->stats.rx_dropped++; ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, xdp.data - xdp.data_hard_start); ++ skb_put(skb, xdp.data_end - xdp.data); ++ skb_mark_for_recycle(skb); ++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ new_data = napi_alloc_frag(ring->frag_size); ++ else ++ new_data = mtk_max_lro_buf_alloc(GFP_ATOMIC); ++ ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto 
release_desc; ++ } ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ new_data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(new_data); ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_unmap_single(eth->dma_dev, trxd.rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ ++ skb = build_skb(data, ring->frag_size); ++ if (unlikely(!skb)) { ++ netdev->stats.rx_dropped++; ++ skb_free_frag(data); ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb_put(skb, pktlen); ++ } ++ ++ skb->dev = netdev; ++ bytes += skb->len; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ hash = trxd.rxd5 & MTK_RXD5_FOE_ENTRY; ++ if (hash != MTK_RXD5_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd3; ++ } else { ++ hash = trxd.rxd4 & MTK_RXD4_FOE_ENTRY; ++ if (hash != MTK_RXD4_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd4; ++ } ++ ++ if (*rxdcsum & eth->soc->txrx.rx_dma_l4_valid) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ reason = FIELD_GET(MTK_RXD4_PPE_CPU_REASON, trxd.rxd4); ++ if (reason == MTK_PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) ++ mtk_ppe_check_skb(eth->ppe, skb, hash); ++ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ if (trxd.rxd3 & RX_DMA_VTAG_V2) ++ __vlan_hwaccel_put_tag(skb, ++ htons(RX_DMA_VPID(trxd.rxd4)), ++ RX_DMA_VID(trxd.rxd4)); ++ } else if (trxd.rxd2 & RX_DMA_VTAG) { ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ RX_DMA_VID(trxd.rxd3)); ++ } ++ ++ /* If the device is attached to a dsa switch, the special ++ * tag inserted in VLAN field by hw switch can * be offloaded ++ * by RX HW VLAN offload. Clear vlan info. 
++ */ ++ if (netdev_uses_dsa(netdev)) ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ skb_record_rx_queue(skb, 0); ++ napi_gro_receive(napi, skb); ++ ++skip_rx: ++ ring->data[idx] = new_data; ++ rxd->rxd1 = (unsigned int)dma_addr; ++release_desc: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ ring->calc_idx = idx; ++ done++; ++ } ++ ++rx_done: ++ if (done) { ++ /* make sure that all changes to the dma ring are flushed before ++ * we continue ++ */ ++ wmb(); ++ mtk_update_rx_cpu_idx(eth); ++ } ++ ++ eth->rx_packets += done; ++ eth->rx_bytes += bytes; ++ dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes, ++ &dim_sample); ++ net_dim(ð->rx_dim, dim_sample); ++ ++ if (xdp_flush) ++ xdp_do_flush_map(); ++ ++ return done; ++} ++ ++static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->last_free_ptr; ++ dma = mtk_r32(eth, reg_map->qdma.drx_ptr); ++ ++ desc = mtk_qdma_phys_to_virt(ring, cpu); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ u32 next_cpu = desc->txd2; ++ int mac = 0; ++ ++ desc = mtk_qdma_phys_to_virt(ring, desc->txd2); ++ if ((desc->txd3 & TX_DMA_OWNER_CPU) == 0) ++ break; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, desc, ++ eth->soc->txrx.txd_size); ++ if (tx_buf->flags & MTK_TX_FLAGS_FPORT1) ++ mac = 1; ++ ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[mac] += skb->len; ++ done[mac]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = next_cpu; ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->last_free_ptr = cpu; ++ mtk_w32(eth, cpu, reg_map->qdma.crx_ptr); ++ ++ return budget; ++} ++ ++static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->cpu_idx; ++ dma = mtk_r32(eth, MT7628_TX_DTX_IDX0); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ tx_buf = &ring->buf[cpu]; ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[0] += skb->len; ++ done[0]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ desc = ring->dma + cpu * eth->soc->txrx.txd_size; ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = NEXT_DESP_IDX(cpu, ring->dma_size); ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->cpu_idx = cpu; ++ ++ return budget; ++} ++ ++static int mtk_poll_tx(struct mtk_eth *eth, int budget) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct dim_sample dim_sample = {}; ++ unsigned int done[MTK_MAX_DEVS]; ++ unsigned int bytes[MTK_MAX_DEVS]; ++ int total = 0, i; ++ ++ memset(done, 0, sizeof(done)); ++ memset(bytes, 0, sizeof(bytes)); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ budget = mtk_poll_tx_qdma(eth, budget, done, bytes); ++ else ++ budget = mtk_poll_tx_pdma(eth, 
budget, done, bytes); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i] || !done[i]) ++ continue; ++ netdev_completed_queue(eth->netdev[i], done[i], bytes[i]); ++ total += done[i]; ++ eth->tx_packets += done[i]; ++ eth->tx_bytes += bytes[i]; ++ } ++ ++ dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes, ++ &dim_sample); ++ net_dim(ð->tx_dim, dim_sample); ++ ++ if (mtk_queue_stopped(eth) && ++ (atomic_read(&ring->free_count) > ring->thresh)) ++ mtk_wake_queue(eth); ++ ++ return total; ++} ++ ++static void mtk_handle_status_irq(struct mtk_eth *eth) ++{ ++ u32 status2 = mtk_r32(eth, MTK_INT_STATUS2); ++ ++ if (unlikely(status2 & (MTK_GDM1_AF | MTK_GDM2_AF))) { ++ mtk_stats_update(eth); ++ mtk_w32(eth, (MTK_GDM1_AF | MTK_GDM2_AF), ++ MTK_INT_STATUS2); ++ } ++} ++ ++static int mtk_napi_tx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, tx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int tx_done = 0; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_handle_status_irq(eth); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->tx_irq_status); ++ tx_done = mtk_poll_tx(eth, budget); ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done tx %d, intr 0x%08x/0x%x\n", tx_done, ++ mtk_r32(eth, reg_map->tx_irq_status), ++ mtk_r32(eth, reg_map->tx_irq_mask)); ++ } ++ ++ if (tx_done == budget) ++ return budget; ++ ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ return budget; ++ ++ if (napi_complete_done(napi, tx_done)) ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ ++ return tx_done; ++} ++ ++static int mtk_napi_rx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int rx_done_total = 0; ++ ++ mtk_handle_status_irq(eth); ++ ++ do { ++ int rx_done; ++ ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, ++ reg_map->pdma.irq_status); ++ rx_done = mtk_poll_rx(napi, budget - rx_done_total, eth); ++ rx_done_total += rx_done; ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done rx %d, intr 0x%08x/0x%x\n", rx_done, ++ mtk_r32(eth, reg_map->pdma.irq_status), ++ mtk_r32(eth, reg_map->pdma.irq_mask)); ++ } ++ ++ if (rx_done_total == budget) ++ return budget; ++ ++ } while (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask); ++ ++ if (napi_complete_done(napi, rx_done_total)) ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ ++ return rx_done_total; ++} ++ ++static int mtk_tx_alloc(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i, sz = soc->txrx.txd_size; ++ struct mtk_tx_dma_v2 *txd; ++ ++ ring->buf = kcalloc(MTK_DMA_SIZE, sizeof(*ring->buf), ++ GFP_KERNEL); ++ if (!ring->buf) ++ goto no_tx_mem; ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ int next = (i + 1) % MTK_DMA_SIZE; ++ u32 next_ptr = ring->phys + next * sz; ++ ++ txd = ring->dma + i * sz; ++ txd->txd2 = next_ptr; ++ txd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ /* On MT7688 (PDMA only) this driver uses the ring->dma structs ++ * only as the framework. 
The real HW descriptors are the PDMA ++ * descriptors in ring->dma_pdma. ++ */ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys_pdma, GFP_KERNEL); ++ if (!ring->dma_pdma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ ring->dma_pdma[i].txd2 = TX_DMA_DESP2_DEF; ++ ring->dma_pdma[i].txd4 = 0; ++ } ++ } ++ ++ ring->dma_size = MTK_DMA_SIZE; ++ atomic_set(&ring->free_count, MTK_DMA_SIZE - 2); ++ ring->next_free = ring->dma; ++ ring->last_free = (void *)txd; ++ ring->last_free_ptr = (u32)(ring->phys + ((MTK_DMA_SIZE - 1) * sz)); ++ ring->thresh = MAX_SKB_FRAGS; ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.ctx_ptr); ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.dtx_ptr); ++ mtk_w32(eth, ++ ring->phys + ((MTK_DMA_SIZE - 1) * sz), ++ soc->reg_map->qdma.crx_ptr); ++ mtk_w32(eth, ring->last_free_ptr, soc->reg_map->qdma.drx_ptr); ++ mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES, ++ soc->reg_map->qdma.qtx_cfg); ++ } else { ++ mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0); ++ mtk_w32(eth, MTK_DMA_SIZE, MT7628_TX_MAX_CNT0); ++ mtk_w32(eth, 0, MT7628_TX_CTX_IDX0); ++ mtk_w32(eth, MT7628_PST_DTX_IDX0, soc->reg_map->pdma.rst_idx); ++ } ++ ++ return 0; ++ ++no_tx_mem: ++ return -ENOMEM; ++} ++ ++static void mtk_tx_clean(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i; ++ ++ if (ring->buf) { ++ for (i = 0; i < MTK_DMA_SIZE; i++) ++ mtk_tx_unmap(eth, &ring->buf[i], NULL, false); ++ kfree(ring->buf); ++ ring->buf = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->dma_pdma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma_pdma, ring->phys_pdma); ++ ring->dma_pdma = NULL; ++ } ++} ++ ++static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_rx_ring *ring; ++ int rx_data_len, rx_dma_size; ++ int i; ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ if (ring_no) ++ return -EINVAL; ++ ring = ð->rx_ring_qdma; ++ } else { ++ ring = ð->rx_ring[ring_no]; ++ } ++ ++ if (rx_flag == MTK_RX_FLAGS_HWLRO) { ++ rx_data_len = MTK_MAX_LRO_RX_LENGTH; ++ rx_dma_size = MTK_HW_LRO_DMA_SIZE; ++ } else { ++ rx_data_len = ETH_DATA_LEN; ++ rx_dma_size = MTK_DMA_SIZE; ++ } ++ ++ ring->frag_size = mtk_max_frag_size(rx_data_len); ++ ring->buf_size = mtk_max_buf_size(ring->frag_size); ++ ring->data = kcalloc(rx_dma_size, sizeof(*ring->data), ++ GFP_KERNEL); ++ if (!ring->data) ++ return -ENOMEM; ++ ++ if (mtk_page_pool_enabled(eth)) { ++ struct page_pool *pp; ++ ++ pp = mtk_create_page_pool(eth, &ring->xdp_q, ring_no, ++ rx_dma_size); ++ if (IS_ERR(pp)) ++ return PTR_ERR(pp); ++ ++ ring->page_pool = pp; ++ } ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, ++ rx_dma_size * eth->soc->txrx.rxd_size, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ return -ENOMEM; ++ ++ for (i = 0; i < rx_dma_size; i++) { ++ struct mtk_rx_dma_v2 *rxd; ++ dma_addr_t dma_addr; ++ void *data; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (ring->page_pool) { ++ data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, GFP_KERNEL); ++ if (!data) ++ return -ENOMEM; 
++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ data = netdev_alloc_frag(ring->frag_size); ++ else ++ data = mtk_max_lro_buf_alloc(GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(data); ++ return -ENOMEM; ++ } ++ } ++ rxd->rxd1 = (unsigned int)dma_addr; ++ ring->data[i] = data; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ rxd->rxd3 = 0; ++ rxd->rxd4 = 0; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = 0; ++ rxd->rxd6 = 0; ++ rxd->rxd7 = 0; ++ rxd->rxd8 = 0; ++ } ++ } ++ ++ ring->dma_size = rx_dma_size; ++ ring->calc_idx_update = false; ++ ring->calc_idx = rx_dma_size - 1; ++ if (rx_flag == MTK_RX_FLAGS_QDMA) ++ ring->crx_idx_reg = reg_map->qdma.qcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ else ++ ring->crx_idx_reg = reg_map->pdma.pcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ mtk_w32(eth, ring->phys, ++ reg_map->qdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->qdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->qdma.rst_idx); ++ } else { ++ mtk_w32(eth, ring->phys, ++ reg_map->pdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->pdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->pdma.rst_idx); ++ } ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ ++ return 0; ++} ++ ++static void mtk_rx_clean(struct mtk_eth *eth, struct mtk_rx_ring *ring) ++{ ++ int i; ++ ++ if (ring->data && ring->dma) { ++ for (i = 0; i < ring->dma_size; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ if (!ring->data[i]) ++ continue; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (!rxd->rxd1) ++ continue; ++ ++ dma_unmap_single(eth->dma_dev, rxd->rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ mtk_rx_put_buff(ring, ring->data[i], false); ++ } ++ kfree(ring->data); ++ ring->data = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ ring->dma_size * eth->soc->txrx.rxd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->page_pool) { ++ if (xdp_rxq_info_is_reg(&ring->xdp_q)) ++ xdp_rxq_info_unreg(&ring->xdp_q); ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int mtk_hwlro_rx_init(struct mtk_eth *eth) ++{ ++ int i; ++ u32 ring_ctrl_dw1 = 0, ring_ctrl_dw2 = 0, ring_ctrl_dw3 = 0; ++ u32 lro_ctrl_dw0 = 0, lro_ctrl_dw3 = 0; ++ ++ /* set LRO rings to auto-learn modes */ ++ ring_ctrl_dw2 |= MTK_RING_AUTO_LERAN_MODE; ++ ++ /* validate LRO ring */ ++ ring_ctrl_dw2 |= MTK_RING_VLD; ++ ++ /* set AGE timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_AGE_TIME_H; ++ ring_ctrl_dw1 |= MTK_RING_AGE_TIME_L; ++ ++ /* set max AGG timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_TIME; ++ ++ /* set max LRO AGG count */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_CNT_L; ++ ring_ctrl_dw3 |= MTK_RING_MAX_AGG_CNT_H; ++ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ mtk_w32(eth, ring_ctrl_dw1, MTK_LRO_CTRL_DW1_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw2, MTK_LRO_CTRL_DW2_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw3, MTK_LRO_CTRL_DW3_CFG(i)); ++ 
} ++ ++ /* IPv4 checksum update enable */ ++ lro_ctrl_dw0 |= MTK_L3_CKS_UPD_EN; ++ ++ /* switch priority comparison to packet count mode */ ++ lro_ctrl_dw0 |= MTK_LRO_ALT_PKT_CNT_MODE; ++ ++ /* bandwidth threshold setting */ ++ mtk_w32(eth, MTK_HW_LRO_BW_THRE, MTK_PDMA_LRO_CTRL_DW2); ++ ++ /* auto-learn score delta setting */ ++ mtk_w32(eth, MTK_HW_LRO_REPLACE_DELTA, MTK_PDMA_LRO_ALT_SCORE_DELTA); ++ ++ /* set refresh timer for altering flows to 1 sec. (unit: 20us) */ ++ mtk_w32(eth, (MTK_HW_LRO_TIMER_UNIT << 16) | MTK_HW_LRO_REFRESH_TIME, ++ MTK_PDMA_LRO_ALT_REFRESH_TIMER); ++ ++ /* set HW LRO mode & the max aggregation count for rx packets */ ++ lro_ctrl_dw3 |= MTK_ADMA_MODE | (MTK_HW_LRO_MAX_AGG_CNT & 0xff); ++ ++ /* the minimal remaining room of SDL0 in RXD for lro aggregation */ ++ lro_ctrl_dw3 |= MTK_LRO_MIN_RXD_SDL; ++ ++ /* enable HW LRO */ ++ lro_ctrl_dw0 |= MTK_LRO_EN; ++ ++ mtk_w32(eth, lro_ctrl_dw3, MTK_PDMA_LRO_CTRL_DW3); ++ mtk_w32(eth, lro_ctrl_dw0, MTK_PDMA_LRO_CTRL_DW0); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_rx_uninit(struct mtk_eth *eth) ++{ ++ int i; ++ u32 val; ++ ++ /* relinquish lro rings, flush aggregated packets */ ++ mtk_w32(eth, MTK_LRO_RING_RELINQUISH_REQ, MTK_PDMA_LRO_CTRL_DW0); ++ ++ /* wait for relinquishments done */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, MTK_PDMA_LRO_CTRL_DW0); ++ if (val & MTK_LRO_RING_RELINQUISH_DONE) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++ ++ /* invalidate lro rings */ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_w32(eth, 0, MTK_LRO_CTRL_DW2_CFG(i)); ++ ++ /* disable HW LRO */ ++ mtk_w32(eth, 0, MTK_PDMA_LRO_CTRL_DW0); ++} ++ ++static void mtk_hwlro_val_ipaddr(struct mtk_eth *eth, int idx, __be32 ip) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, ip, MTK_LRO_DIP_DW0_CFG(idx)); ++ ++ /* validate the IP setting */ ++ mtk_w32(eth, (reg_val | MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++} ++ ++static void mtk_hwlro_inval_ipaddr(struct mtk_eth *eth, int idx) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, 0, MTK_LRO_DIP_DW0_CFG(idx)); ++} ++ ++static int mtk_hwlro_get_ip_cnt(struct mtk_mac *mac) ++{ ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) ++ cnt++; ++ } ++ ++ return cnt; ++} ++ ++static int mtk_hwlro_add_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if ((fsp->flow_type != TCP_V4_FLOW) || ++ (!fsp->h_u.tcp_ip4_spec.ip4dst) || ++ (fsp->location > 1)) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = htonl(fsp->h_u.tcp_ip4_spec.ip4dst); ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_val_ipaddr(eth, hwlro_idx, mac->hwlro_ip[fsp->location]); ++ ++ return 0; ++} ++ ++static int mtk_hwlro_del_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if 
(fsp->location > 1) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_netdev_disable(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int i, hwlro_idx; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ mac->hwlro_ip[i] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + i; ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ } ++ ++ mac->hwlro_ip_cnt = 0; ++} ++ ++static int mtk_hwlro_get_fdir_entry(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ ++ if (fsp->location >= ARRAY_SIZE(mac->hwlro_ip)) ++ return -EINVAL; ++ ++ /* only tcp dst ipv4 is meaningful, others are meaningless */ ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ntohl(mac->hwlro_ip[fsp->location]); ++ fsp->m_u.tcp_ip4_spec.ip4dst = 0; ++ ++ fsp->h_u.tcp_ip4_spec.ip4src = 0; ++ fsp->m_u.tcp_ip4_spec.ip4src = 0xffffffff; ++ fsp->h_u.tcp_ip4_spec.psrc = 0; ++ fsp->m_u.tcp_ip4_spec.psrc = 0xffff; ++ fsp->h_u.tcp_ip4_spec.pdst = 0; ++ fsp->m_u.tcp_ip4_spec.pdst = 0xffff; ++ fsp->h_u.tcp_ip4_spec.tos = 0; ++ fsp->m_u.tcp_ip4_spec.tos = 0xff; ++ ++ return 0; ++} ++ ++static int mtk_hwlro_get_fdir_all(struct net_device *dev, ++ struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) { ++ rule_locs[cnt] = i; ++ cnt++; ++ } ++ } ++ ++ cmd->rule_cnt = cnt; ++ ++ return 0; ++} ++ ++static netdev_features_t mtk_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (!(features & NETIF_F_LRO)) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ int ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ if (ip_cnt) { ++ netdev_info(dev, "RX flow is programmed, LRO should keep on\n"); ++ ++ features |= NETIF_F_LRO; ++ } ++ } ++ ++ return features; ++} ++ ++static int mtk_set_features(struct net_device *dev, netdev_features_t features) ++{ ++ int err = 0; ++ ++ if (!((dev->features ^ features) & NETIF_F_LRO)) ++ return 0; ++ ++ if (!(features & NETIF_F_LRO)) ++ mtk_hwlro_netdev_disable(dev); ++ ++ return err; ++} ++ ++/* wait for DMA to finish whatever it is doing before we start using it again */ ++static int mtk_dma_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned int reg; ++ int ret; ++ u32 val; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ reg = eth->soc->reg_map->qdma.glo_cfg; ++ else ++ reg = eth->soc->reg_map->pdma.glo_cfg; ++ ++ ret = readx_poll_timeout_atomic(__raw_readl, eth->base + reg, val, ++ !(val & (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)), ++ 5, MTK_DMA_BUSY_TIMEOUT_US); ++ if (ret) ++ dev_err(eth->dev, "DMA init timeout\n"); ++ ++ return ret; ++} ++ ++static int mtk_dma_init(struct mtk_eth *eth) ++{ ++ int err; ++ u32 i; ++ ++ if (mtk_dma_busy_wait(eth)) ++ return -EBUSY; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* QDMA needs scratch memory for internal reordering of the ++ * descriptors ++ */ ++ err = mtk_init_fq_dma(eth); ++ if (err) ++ return err; ++ } ++ ++ err = mtk_tx_alloc(eth); ++ if (err) ++ return err; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA); ++ if (err) ++ return err; ++ } 
++ ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_NORMAL); ++ if (err) ++ return err; ++ ++ if (eth->hwlro) { ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ err = mtk_rx_alloc(eth, i, MTK_RX_FLAGS_HWLRO); ++ if (err) ++ return err; ++ } ++ err = mtk_hwlro_rx_init(eth); ++ if (err) ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* Enable random early drop and set drop threshold ++ * automatically ++ */ ++ mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN | ++ FC_THRES_MIN, eth->soc->reg_map->qdma.fc_th); ++ mtk_w32(eth, 0x0, eth->soc->reg_map->qdma.hred); ++ } ++ ++ return 0; ++} ++ ++static void mtk_dma_free(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ if (eth->netdev[i]) ++ netdev_reset_queue(eth->netdev[i]); ++ if (eth->scratch_ring) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ eth->scratch_ring, eth->phy_scratch_ring); ++ eth->scratch_ring = NULL; ++ eth->phy_scratch_ring = 0; ++ } ++ mtk_tx_clean(eth); ++ mtk_rx_clean(eth, ð->rx_ring[0]); ++ mtk_rx_clean(eth, ð->rx_ring_qdma); ++ ++ if (eth->hwlro) { ++ mtk_hwlro_rx_uninit(eth); ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_rx_clean(eth, ð->rx_ring[i]); ++ } ++ ++ kfree(eth->scratch_head); ++} ++ ++static void mtk_tx_timeout(struct net_device *dev, unsigned int txqueue) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ eth->netdev[mac->id]->stats.tx_errors++; ++ netif_err(eth, tx_err, dev, ++ "transmit timed out\n"); ++ schedule_work(ð->pending_work); ++} ++ ++static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->rx_events++; ++ if (likely(napi_schedule_prep(ð->rx_napi))) { ++ __napi_schedule(ð->rx_napi); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->tx_events++; ++ if (likely(napi_schedule_prep(ð->tx_napi))) { ++ __napi_schedule(ð->tx_napi); ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ ++ if (mtk_r32(eth, reg_map->pdma.irq_mask) & ++ eth->soc->txrx.rx_irq_done_mask) { ++ if (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask) ++ mtk_handle_irq_rx(irq, _eth); ++ } ++ if (mtk_r32(eth, reg_map->tx_irq_mask) & MTK_TX_DONE_INT) { ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ mtk_handle_irq_tx(irq, _eth); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++static void mtk_poll_controller(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ mtk_handle_irq_rx(eth->irq[2], dev); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++} ++#endif ++ ++static int mtk_start_dma(struct mtk_eth *eth) ++{ ++ u32 val, rx_2b_offset = (NET_IP_ALIGN == 2) ? 
MTK_RX_2B_OFFSET : 0; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int err; ++ ++ err = mtk_dma_init(eth); ++ if (err) { ++ mtk_dma_free(eth); ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ val = mtk_r32(eth, reg_map->qdma.glo_cfg); ++ val |= MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_TX_BT_32DWORDS | MTK_NDP_CO_PRO | ++ MTK_RX_2B_OFFSET | MTK_TX_WB_DDONE; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ val |= MTK_MUTLI_CNT | MTK_RESV_BUF | ++ MTK_WCOMP_EN | MTK_DMAD_WR_WDONE | ++ MTK_CHK_DDONE_EN; ++ else ++ val |= MTK_RX_BT_32DWORDS; ++ mtk_w32(eth, val, reg_map->qdma.glo_cfg); ++ ++ mtk_w32(eth, ++ MTK_RX_DMA_EN | rx_2b_offset | ++ MTK_RX_BT_32DWORDS | MTK_MULTI_EN, ++ reg_map->pdma.glo_cfg); ++ } else { ++ mtk_w32(eth, MTK_TX_WB_DDONE | MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_MULTI_EN | MTK_PDMA_SIZE_8DWORDS, ++ reg_map->pdma.glo_cfg); ++ } ++ ++ return 0; ++} ++ ++static void mtk_gdm_config(struct mtk_eth *eth, u32 config) ++{ ++ int i; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ return; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ u32 val = mtk_r32(eth, MTK_GDMA_FWD_CFG(i)); ++ ++ /* default setup the forward port to send frame to PDMA */ ++ val &= ~0xffff; ++ ++ /* Enable RX checksum */ ++ val |= MTK_GDMA_ICS_EN | MTK_GDMA_TCS_EN | MTK_GDMA_UCS_EN; ++ ++ val |= config; ++ ++ if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) ++ val |= MTK_GDMA_SPECIAL_TAG; ++ ++ mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); ++ } ++ /* Reset and enable PSE */ ++ mtk_w32(eth, RST_GL_PSE, MTK_RST_GL); ++ mtk_w32(eth, 0, MTK_RST_GL); ++} ++ ++static int mtk_open(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int err; ++ ++ err = phylink_of_phy_connect(mac->phylink, mac->of_node, 0); ++ if (err) { ++ netdev_err(dev, "%s: could not attach PHY: %d\n", __func__, ++ err); ++ return err; ++ } ++ ++ /* we run 2 netdevs on the same dma ring so we only bring it up once */ ++ if (!refcount_read(ð->dma_refcnt)) { ++ u32 gdm_config = MTK_GDMA_TO_PDMA; ++ ++ err = mtk_start_dma(eth); ++ if (err) { ++ phylink_disconnect_phy(mac->phylink); ++ return err; ++ } ++ ++ if (eth->soc->offload_version && mtk_ppe_start(eth->ppe) == 0) ++ gdm_config = MTK_GDMA_TO_PPE; ++ ++ mtk_gdm_config(eth, gdm_config); ++ ++ napi_enable(ð->tx_napi); ++ napi_enable(ð->rx_napi); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ refcount_set(ð->dma_refcnt, 1); ++ } ++ else ++ refcount_inc(ð->dma_refcnt); ++ ++ phylink_start(mac->phylink); ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static void mtk_stop_dma(struct mtk_eth *eth, u32 glo_cfg) ++{ ++ u32 val; ++ int i; ++ ++ /* stop the dma engine */ ++ spin_lock_bh(ð->page_lock); ++ val = mtk_r32(eth, glo_cfg); ++ mtk_w32(eth, val & ~(MTK_TX_WB_DDONE | MTK_RX_DMA_EN | MTK_TX_DMA_EN), ++ glo_cfg); ++ spin_unlock_bh(ð->page_lock); ++ ++ /* wait for dma stop */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, glo_cfg); ++ if (val & (MTK_TX_DMA_BUSY | MTK_RX_DMA_BUSY)) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++} ++ ++static int mtk_stop(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_stop(mac->phylink); ++ ++ netif_tx_disable(dev); ++ ++ phylink_disconnect_phy(mac->phylink); ++ ++ /* only shutdown DMA if this is the last user */ ++ if (!refcount_dec_and_test(ð->dma_refcnt)) ++ return 0; ++ ++ mtk_gdm_config(eth, 
MTK_GDMA_DROP_ALL); ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ napi_disable(ð->tx_napi); ++ napi_disable(ð->rx_napi); ++ ++ cancel_work_sync(ð->rx_dim.work); ++ cancel_work_sync(ð->tx_dim.work); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_stop_dma(eth, eth->soc->reg_map->qdma.glo_cfg); ++ mtk_stop_dma(eth, eth->soc->reg_map->pdma.glo_cfg); ++ ++ mtk_dma_free(eth); ++ ++ if (eth->soc->offload_version) ++ mtk_ppe_stop(eth->ppe); ++ ++ return 0; ++} ++ ++static int mtk_xdp_setup(struct net_device *dev, struct bpf_prog *prog, ++ struct netlink_ext_ack *extack) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct bpf_prog *old_prog; ++ bool need_update; ++ ++ if (eth->hwlro) { ++ NL_SET_ERR_MSG_MOD(extack, "XDP not supported with HWLRO"); ++ return -EOPNOTSUPP; ++ } ++ ++ if (dev->mtu > MTK_PP_MAX_BUF_SIZE) { ++ NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP"); ++ return -EOPNOTSUPP; ++ } ++ ++ need_update = !!eth->prog != !!prog; ++ if (netif_running(dev) && need_update) ++ mtk_stop(dev); ++ ++ old_prog = rcu_replace_pointer(eth->prog, prog, lockdep_rtnl_is_held()); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ ++ if (netif_running(dev) && need_update) ++ return mtk_open(dev); ++ ++ return 0; ++} ++ ++static int mtk_xdp(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ return mtk_xdp_setup(dev, xdp->prog, xdp->extack); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ethsys_reset(struct mtk_eth *eth, u32 reset_bits) ++{ ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ reset_bits); ++ ++ usleep_range(1000, 1100); ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ ~reset_bits); ++ mdelay(10); ++} ++ ++static void mtk_clk_disable(struct mtk_eth *eth) ++{ ++ int clk; ++ ++ for (clk = MTK_CLK_MAX - 1; clk >= 0; clk--) ++ clk_disable_unprepare(eth->clks[clk]); ++} ++ ++static int mtk_clk_enable(struct mtk_eth *eth) ++{ ++ int clk, ret; ++ ++ for (clk = 0; clk < MTK_CLK_MAX ; clk++) { ++ ret = clk_prepare_enable(eth->clks[clk]); ++ if (ret) ++ goto err_disable_clks; ++ } ++ ++ return 0; ++ ++err_disable_clks: ++ while (--clk >= 0) ++ clk_disable_unprepare(eth->clks[clk]); ++ ++ return ret; ++} ++ ++static void mtk_dim_rx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, rx_dim); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_rx_moderation(eth->rx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_TX_MASK; ++ val |= MTK_PDMA_DELAY_RX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void mtk_dim_tx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, tx_dim); ++ const struct 
mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_tx_moderation(eth->tx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_RX_MASK; ++ val |= MTK_PDMA_DELAY_TX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static int mtk_hw_init(struct mtk_eth *eth) ++{ ++ u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA | ++ ETHSYS_DMA_AG_MAP_PPE; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int i, val, ret; ++ ++ if (test_and_set_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ pm_runtime_enable(eth->dev); ++ pm_runtime_get_sync(eth->dev); ++ ++ ret = mtk_clk_enable(eth); ++ if (ret) ++ goto err_disable_pm; ++ ++ if (eth->ethsys) ++ regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask, ++ of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ ret = device_reset(eth->dev); ++ if (ret) { ++ dev_err(eth->dev, "MAC reset failed!\n"); ++ goto err_disable_pm; ++ } ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ return 0; ++ } ++ ++ val = RSTCTRL_FE | RSTCTRL_PPE; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, 0); ++ ++ val |= RSTCTRL_ETH; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RSTCTRL_PPE1)) ++ val |= RSTCTRL_PPE1; ++ } ++ ++ ethsys_reset(eth, val); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, ++ 0x3ffffff); ++ ++ /* Set FE to PDMAv2 if necessary */ ++ val = mtk_r32(eth, MTK_FE_GLO_MISC); ++ mtk_w32(eth, val | BIT(4), MTK_FE_GLO_MISC); ++ } ++ ++ if (eth->pctl) { ++ /* Set GE2 driving and slew rate */ ++ regmap_write(eth->pctl, GPIO_DRV_SEL10, 0xa00); ++ ++ /* set GE2 TDSEL */ ++ regmap_write(eth->pctl, GPIO_OD33_CTRL8, 0x5); ++ ++ /* set GE2 TUNE */ ++ regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0); ++ } ++ ++ /* Set linkdown as the default for each GMAC. Its own MCR would be set ++ * up with the more appropriate value when mtk_mac_config call is being ++ * invoked. ++ */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ mtk_w32(eth, MAC_MCR_FORCE_LINK_DOWN, MTK_MAC_MCR(i)); ++ ++ /* Indicates CDM to parse the MTK special tag from CPU ++ * which also is working out for untag packets. 
++	 */
++	val = mtk_r32(eth, MTK_CDMQ_IG_CTRL);
++	mtk_w32(eth, val | MTK_CDMQ_STAG_EN, MTK_CDMQ_IG_CTRL);
++
++	/* Enable RX VLan Offloading */
++	mtk_w32(eth, 1, MTK_CDMP_EG_CTRL);
++
++	/* set interrupt delays based on current Net DIM sample */
++	mtk_dim_rx(&eth->rx_dim.work);
++	mtk_dim_tx(&eth->tx_dim.work);
++
++	/* disable delay and normal interrupt */
++	mtk_tx_irq_disable(eth, ~0);
++	mtk_rx_irq_disable(eth, ~0);
++
++	/* FE int grouping */
++	mtk_w32(eth, MTK_TX_DONE_INT, reg_map->pdma.int_grp);
++	mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->pdma.int_grp + 4);
++	mtk_w32(eth, MTK_TX_DONE_INT, reg_map->qdma.int_grp);
++	mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->qdma.int_grp + 4);
++	mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP);
++
++	if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) {
++		/* PSE should not drop port8 and port9 packets */
++		mtk_w32(eth, 0x00000300, PSE_DROP_CFG);
++
++		/* PSE Free Queue Flow Control */
++		mtk_w32(eth, 0x01fa01f4, PSE_FQFC_CFG2);
++
++		/* PSE config input queue threshold */
++		mtk_w32(eth, 0x001a000e, PSE_IQ_REV(1));
++		mtk_w32(eth, 0x01ff001a, PSE_IQ_REV(2));
++		mtk_w32(eth, 0x000e01ff, PSE_IQ_REV(3));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(4));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(5));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(6));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(7));
++		mtk_w32(eth, 0x000e000e, PSE_IQ_REV(8));
++
++		/* PSE config output queue threshold */
++		mtk_w32(eth, 0x000f000a, PSE_OQ_TH(1));
++		mtk_w32(eth, 0x001a000f, PSE_OQ_TH(2));
++		mtk_w32(eth, 0x000f001a, PSE_OQ_TH(3));
++		mtk_w32(eth, 0x01ff000f, PSE_OQ_TH(4));
++		mtk_w32(eth, 0x000f000f, PSE_OQ_TH(5));
++		mtk_w32(eth, 0x0006000f, PSE_OQ_TH(6));
++		mtk_w32(eth, 0x00060006, PSE_OQ_TH(7));
++		mtk_w32(eth, 0x00060006, PSE_OQ_TH(8));
++
++		/* GDM and CDM Threshold */
++		mtk_w32(eth, 0x00000004, MTK_GDM2_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMW0_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMW1_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDME0_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDME1_THRES);
++		mtk_w32(eth, 0x00000004, MTK_CDMM_THRES);
++	}
++
++	return 0;
++
++err_disable_pm:
++	pm_runtime_put_sync(eth->dev);
++	pm_runtime_disable(eth->dev);
++
++	return ret;
++}
++
++static int mtk_hw_deinit(struct mtk_eth *eth)
++{
++	if (!test_and_clear_bit(MTK_HW_INIT, &eth->state))
++		return 0;
++
++	mtk_clk_disable(eth);
++
++	pm_runtime_put_sync(eth->dev);
++	pm_runtime_disable(eth->dev);
++
++	return 0;
++}
++
++static int __init mtk_init(struct net_device *dev)
++{
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++	int ret;
++
++	ret = of_get_ethdev_address(mac->of_node, dev);
++	if (ret) {
++		/* If the mac address is invalid, use random mac address */
++		eth_hw_addr_random(dev);
++		dev_err(eth->dev, "generated random MAC address %pM\n",
++			dev->dev_addr);
++	}
++
++	return 0;
++}
++
++static void mtk_uninit(struct net_device *dev)
++{
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++
++	phylink_disconnect_phy(mac->phylink);
++	mtk_tx_irq_disable(eth, ~0);
++	mtk_rx_irq_disable(eth, ~0);
++}
++
++static int mtk_change_mtu(struct net_device *dev, int new_mtu)
++{
++	int length = new_mtu + MTK_RX_ETH_HLEN;
++	struct mtk_mac *mac = netdev_priv(dev);
++	struct mtk_eth *eth = mac->hw;
++	u32 mcr_cur, mcr_new;
++
++	if (rcu_access_pointer(eth->prog) &&
++	    length > MTK_PP_MAX_BUF_SIZE) {
++		netdev_err(dev, "Invalid MTU for XDP mode\n");
++		return -EINVAL;
++	}
++
++	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
++
mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur & ~MAC_MCR_MAX_RX_MASK; ++ ++ if (length <= 1518) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1518); ++ else if (length <= 1536) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1536); ++ else if (length <= 1552) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1552); ++ else ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_2048); ++ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ } ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSMIIREG: ++ return phylink_mii_ioctl(mac->phylink, ifr, cmd); ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static void mtk_pending_work(struct work_struct *work) ++{ ++ struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work); ++ int err, i; ++ unsigned long restart = 0; ++ ++ rtnl_lock(); ++ ++ dev_dbg(eth->dev, "[%s][%d] reset\n", __func__, __LINE__); ++ ++ while (test_and_set_bit_lock(MTK_RESETTING, ð->state)) ++ cpu_relax(); ++ ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop starts\n", __func__, __LINE__); ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ __set_bit(i, &restart); ++ } ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop ends\n", __func__, __LINE__); ++ ++ /* restart underlying hardware such as power, clock, pin mux ++ * and the connected phy ++ */ ++ mtk_hw_deinit(eth); ++ ++ if (eth->dev->pins) ++ pinctrl_select_state(eth->dev->pins->p, ++ eth->dev->pins->default_state); ++ mtk_hw_init(eth); ++ ++ /* restart DMA and enable IRQs */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!test_bit(i, &restart)) ++ continue; ++ err = mtk_open(eth->netdev[i]); ++ if (err) { ++ netif_alert(eth, ifup, eth->netdev[i], ++ "Driver up/down cycle failed, closing device.\n"); ++ dev_close(eth->netdev[i]); ++ } ++ } ++ ++ dev_dbg(eth->dev, "[%s][%d] reset done\n", __func__, __LINE__); ++ ++ clear_bit_unlock(MTK_RESETTING, ð->state); ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_free_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ free_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_unreg_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ unregister_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_cleanup(struct mtk_eth *eth) ++{ ++ mtk_unreg_dev(eth); ++ mtk_free_dev(eth); ++ cancel_work_sync(ð->pending_work); ++ ++ return 0; ++} ++ ++static int mtk_get_link_ksettings(struct net_device *ndev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_get(mac->phylink, cmd); ++} ++ ++static int mtk_set_link_ksettings(struct net_device *ndev, ++ const struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_set(mac->phylink, cmd); ++} ++ ++static void mtk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ 
++ strlcpy(info->driver, mac->hw->dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(mac->hw->dev), sizeof(info->bus_info)); ++ info->n_stats = ARRAY_SIZE(mtk_ethtool_stats); ++} ++ ++static u32 mtk_get_msglevel(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ return mac->hw->msg_enable; ++} ++ ++static void mtk_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ mac->hw->msg_enable = value; ++} ++ ++static int mtk_nway_reset(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ if (!mac->phylink) ++ return -ENOTSUPP; ++ ++ return phylink_ethtool_nway_reset(mac->phylink); ++} ++ ++static void mtk_get_strings(struct net_device *dev, u32 stringset, u8 *data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) { ++ memcpy(data, mtk_ethtool_stats[i].str, ETH_GSTRING_LEN); ++ data += ETH_GSTRING_LEN; ++ } ++ if (mtk_page_pool_enabled(mac->hw)) ++ page_pool_ethtool_stats_get_strings(data); ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++static int mtk_get_sset_count(struct net_device *dev, int sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: { ++ int count = ARRAY_SIZE(mtk_ethtool_stats); ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (mtk_page_pool_enabled(mac->hw)) ++ count += page_pool_ethtool_stats_get_count(); ++ return count; ++ } ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void mtk_ethtool_pp_stats(struct mtk_eth *eth, u64 *data) ++{ ++ struct page_pool_stats stats = {}; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(eth->rx_ring); i++) { ++ struct mtk_rx_ring *ring = ð->rx_ring[i]; ++ ++ if (!ring->page_pool) ++ continue; ++ ++ page_pool_get_stats(ring->page_pool, &stats); ++ } ++ page_pool_ethtool_stats_get(data, &stats); ++} ++ ++static void mtk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 *data) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hwstats = mac->hw_stats; ++ u64 *data_src, *data_dst; ++ unsigned int start; ++ int i; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hwstats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hwstats->stats_lock); ++ } ++ } ++ ++ data_src = (u64 *)hwstats; ++ ++ do { ++ data_dst = data; ++ start = u64_stats_fetch_begin_irq(&hwstats->syncp); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) ++ *data_dst++ = *(data_src + mtk_ethtool_stats[i].offset); ++ if (mtk_page_pool_enabled(mac->hw)) ++ mtk_ethtool_pp_stats(mac->hw, data_dst); ++ } while (u64_stats_fetch_retry_irq(&hwstats->syncp, start)); ++} ++ ++static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ if (dev->hw_features & NETIF_F_LRO) { ++ cmd->data = MTK_MAX_RX_RING_NUM; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRLCNT: ++ if (dev->hw_features & NETIF_F_LRO) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ cmd->rule_cnt = mac->hwlro_ip_cnt; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRULE: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_entry(dev, cmd); ++ break; ++ case ETHTOOL_GRXCLSRLALL: ++ if (dev->hw_features & 
NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_all(dev, cmd, ++ rule_locs); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int mtk_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXCLSRLINS: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_add_ipaddr(dev, cmd); ++ break; ++ case ETHTOOL_SRXCLSRLDEL: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_del_ipaddr(dev, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static const struct ethtool_ops mtk_ethtool_ops = { ++ .get_link_ksettings = mtk_get_link_ksettings, ++ .set_link_ksettings = mtk_set_link_ksettings, ++ .get_drvinfo = mtk_get_drvinfo, ++ .get_msglevel = mtk_get_msglevel, ++ .set_msglevel = mtk_set_msglevel, ++ .nway_reset = mtk_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_strings = mtk_get_strings, ++ .get_sset_count = mtk_get_sset_count, ++ .get_ethtool_stats = mtk_get_ethtool_stats, ++ .get_rxnfc = mtk_get_rxnfc, ++ .set_rxnfc = mtk_set_rxnfc, ++}; ++ ++static const struct net_device_ops mtk_netdev_ops = { ++ .ndo_init = mtk_init, ++ .ndo_uninit = mtk_uninit, ++ .ndo_open = mtk_open, ++ .ndo_stop = mtk_stop, ++ .ndo_start_xmit = mtk_start_xmit, ++ .ndo_set_mac_address = mtk_set_mac_address, ++ .ndo_validate_addr = eth_validate_addr, ++ .ndo_eth_ioctl = mtk_do_ioctl, ++ .ndo_change_mtu = mtk_change_mtu, ++ .ndo_tx_timeout = mtk_tx_timeout, ++ .ndo_get_stats64 = mtk_get_stats64, ++ .ndo_fix_features = mtk_fix_features, ++ .ndo_set_features = mtk_set_features, ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = mtk_poll_controller, ++#endif ++ .ndo_setup_tc = mtk_eth_setup_tc, ++ .ndo_bpf = mtk_xdp, ++ .ndo_xdp_xmit = mtk_xdp_xmit, ++}; ++ ++static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) ++{ ++ const __be32 *_id = of_get_property(np, "reg", NULL); ++ phy_interface_t phy_mode; ++ struct phylink *phylink; ++ struct mtk_mac *mac; ++ int id, err; ++ ++ if (!_id) { ++ dev_err(eth->dev, "missing mac id\n"); ++ return -EINVAL; ++ } ++ ++ id = be32_to_cpup(_id); ++ if (id >= MTK_MAC_COUNT) { ++ dev_err(eth->dev, "%d is not a valid mac id\n", id); ++ return -EINVAL; ++ } ++ ++ if (eth->netdev[id]) { ++ dev_err(eth->dev, "duplicate mac id found: %d\n", id); ++ return -EINVAL; ++ } ++ ++ eth->netdev[id] = alloc_etherdev(sizeof(*mac)); ++ if (!eth->netdev[id]) { ++ dev_err(eth->dev, "alloc_etherdev failed\n"); ++ return -ENOMEM; ++ } ++ mac = netdev_priv(eth->netdev[id]); ++ eth->mac[id] = mac; ++ mac->id = id; ++ mac->hw = eth; ++ mac->of_node = np; ++ ++ memset(mac->hwlro_ip, 0, sizeof(mac->hwlro_ip)); ++ mac->hwlro_ip_cnt = 0; ++ ++ mac->hw_stats = devm_kzalloc(eth->dev, ++ sizeof(*mac->hw_stats), ++ GFP_KERNEL); ++ if (!mac->hw_stats) { ++ dev_err(eth->dev, "failed to allocate counter memory\n"); ++ err = -ENOMEM; ++ goto free_netdev; ++ } ++ spin_lock_init(&mac->hw_stats->stats_lock); ++ u64_stats_init(&mac->hw_stats->syncp); ++ mac->hw_stats->reg_offset = id * MTK_STAT_OFFSET; ++ ++ /* phylink create */ ++ err = of_get_phy_mode(np, &phy_mode); ++ if (err) { ++ dev_err(eth->dev, "incorrect phy-mode\n"); ++ goto free_netdev; ++ } ++ ++ /* mac config is not set */ ++ mac->interface = PHY_INTERFACE_MODE_NA; ++ mac->speed = SPEED_UNKNOWN; ++ ++ mac->phylink_config.dev = ð->netdev[id]->dev; ++ mac->phylink_config.type = PHYLINK_NETDEV; ++ /* This driver makes use of state->speed in mac_config */ ++ mac->phylink_config.legacy_pre_march2020 = true; ++ 
mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | ++ MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; ++ ++ __set_bit(PHY_INTERFACE_MODE_MII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_GMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_RGMII)) ++ phy_interface_set_rgmii(mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_TRGMII) && !mac->id) ++ __set_bit(PHY_INTERFACE_MODE_TRGMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII)) { ++ __set_bit(PHY_INTERFACE_MODE_SGMII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_1000BASEX, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_2500BASEX, ++ mac->phylink_config.supported_interfaces); ++ } ++ ++ phylink = phylink_create(&mac->phylink_config, ++ of_fwnode_handle(mac->of_node), ++ phy_mode, &mtk_phylink_ops); ++ if (IS_ERR(phylink)) { ++ err = PTR_ERR(phylink); ++ goto free_netdev; ++ } ++ ++ mac->phylink = phylink; ++ ++ SET_NETDEV_DEV(eth->netdev[id], eth->dev); ++ eth->netdev[id]->watchdog_timeo = 5 * HZ; ++ eth->netdev[id]->netdev_ops = &mtk_netdev_ops; ++ eth->netdev[id]->base_addr = (unsigned long)eth->base; ++ ++ eth->netdev[id]->hw_features = eth->soc->hw_features; ++ if (eth->hwlro) ++ eth->netdev[id]->hw_features |= NETIF_F_LRO; ++ ++ eth->netdev[id]->vlan_features = eth->soc->hw_features & ++ ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX); ++ eth->netdev[id]->features |= eth->soc->hw_features; ++ eth->netdev[id]->ethtool_ops = &mtk_ethtool_ops; ++ ++ eth->netdev[id]->irq = eth->irq[0]; ++ eth->netdev[id]->dev.of_node = np; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN; ++ else ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return 0; ++ ++free_netdev: ++ free_netdev(eth->netdev[id]); ++ return err; ++} ++ ++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(dev_list); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ dev = eth->netdev[i]; ++ ++ if (!dev || !(dev->flags & IFF_UP)) ++ continue; ++ ++ list_add_tail(&dev->close_list, &dev_list); ++ } ++ ++ dev_close_many(&dev_list, false); ++ ++ eth->dma_dev = dma_dev; ++ ++ list_for_each_entry_safe(dev, tmp, &dev_list, close_list) { ++ list_del_init(&dev->close_list); ++ dev_open(dev, NULL); ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_probe(struct platform_device *pdev) ++{ ++ struct device_node *mac_np; ++ struct mtk_eth *eth; ++ int err, i; ++ ++ eth = devm_kzalloc(&pdev->dev, sizeof(*eth), GFP_KERNEL); ++ if (!eth) ++ return -ENOMEM; ++ ++ eth->soc = of_device_get_match_data(&pdev->dev); ++ ++ eth->dev = &pdev->dev; ++ eth->dma_dev = &pdev->dev; ++ eth->base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(eth->base)) ++ return PTR_ERR(eth->base); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->ip_align = NET_IP_ALIGN; ++ ++ spin_lock_init(ð->page_lock); ++ spin_lock_init(ð->tx_irq_lock); ++ spin_lock_init(ð->rx_irq_lock); ++ spin_lock_init(ð->dim_lock); ++ ++ eth->rx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->rx_dim.work, mtk_dim_rx); ++ ++ eth->tx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->tx_dim.work, mtk_dim_tx); ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, 
MTK_SOC_MT7628)) { ++ eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,ethsys"); ++ if (IS_ERR(eth->ethsys)) { ++ dev_err(&pdev->dev, "no ethsys regmap found\n"); ++ return PTR_ERR(eth->ethsys); ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_INFRA)) { ++ eth->infra = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,infracfg"); ++ if (IS_ERR(eth->infra)) { ++ dev_err(&pdev->dev, "no infracfg regmap found\n"); ++ return PTR_ERR(eth->infra); ++ } ++ } ++ ++ if (of_dma_is_coherent(pdev->dev.of_node)) { ++ struct regmap *cci; ++ ++ cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "cci-control-port"); ++ /* enable CPU/bus coherency */ ++ if (!IS_ERR(cci)) ++ regmap_write(cci, 0, 3); ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii), ++ GFP_KERNEL); ++ if (!eth->sgmii) ++ return -ENOMEM; ++ ++ err = mtk_sgmii_init(eth->sgmii, pdev->dev.of_node, ++ eth->soc->ana_rgc3); ++ ++ if (err) ++ return err; ++ } ++ ++ if (eth->soc->required_pctl) { ++ eth->pctl = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,pctl"); ++ if (IS_ERR(eth->pctl)) { ++ dev_err(&pdev->dev, "no pctl regmap found\n"); ++ return PTR_ERR(eth->pctl); ++ } ++ } ++ ++ for (i = 0;; i++) { ++ struct device_node *np = of_parse_phandle(pdev->dev.of_node, ++ "mediatek,wed", i); ++ static const u32 wdma_regs[] = { ++ MTK_WDMA0_BASE, ++ MTK_WDMA1_BASE ++ }; ++ void __iomem *wdma; ++ ++ if (!np || i >= ARRAY_SIZE(wdma_regs)) ++ break; ++ ++ wdma = eth->base + wdma_regs[i]; ++ mtk_wed_add_hw(np, eth, wdma, i); ++ } ++ ++ for (i = 0; i < 3; i++) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT) && i > 0) ++ eth->irq[i] = eth->irq[0]; ++ else ++ eth->irq[i] = platform_get_irq(pdev, i); ++ if (eth->irq[i] < 0) { ++ dev_err(&pdev->dev, "no IRQ%d resource found\n", i); ++ err = -ENXIO; ++ goto err_wed_exit; ++ } ++ } ++ for (i = 0; i < ARRAY_SIZE(eth->clks); i++) { ++ eth->clks[i] = devm_clk_get(eth->dev, ++ mtk_clks_source_name[i]); ++ if (IS_ERR(eth->clks[i])) { ++ if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) { ++ err = -EPROBE_DEFER; ++ goto err_wed_exit; ++ } ++ if (eth->soc->required_clks & BIT(i)) { ++ dev_err(&pdev->dev, "clock %s not found\n", ++ mtk_clks_source_name[i]); ++ err = -EINVAL; ++ goto err_wed_exit; ++ } ++ eth->clks[i] = NULL; ++ } ++ } ++ ++ eth->msg_enable = netif_msg_init(mtk_msg_level, MTK_DEFAULT_MSG_ENABLE); ++ INIT_WORK(ð->pending_work, mtk_pending_work); ++ ++ err = mtk_hw_init(eth); ++ if (err) ++ goto err_wed_exit; ++ ++ eth->hwlro = MTK_HAS_CAPS(eth->soc->caps, MTK_HWLRO); ++ ++ for_each_child_of_node(pdev->dev.of_node, mac_np) { ++ if (!of_device_is_compatible(mac_np, ++ "mediatek,eth-mac")) ++ continue; ++ ++ if (!of_device_is_available(mac_np)) ++ continue; ++ ++ err = mtk_add_mac(eth, mac_np); ++ if (err) { ++ of_node_put(mac_np); ++ goto err_deinit_hw; ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT)) { ++ err = devm_request_irq(eth->dev, eth->irq[0], ++ mtk_handle_irq, 0, ++ dev_name(eth->dev), eth); ++ } else { ++ err = devm_request_irq(eth->dev, eth->irq[1], ++ mtk_handle_irq_tx, 0, ++ dev_name(eth->dev), eth); ++ if (err) ++ goto err_free_dev; ++ ++ err = devm_request_irq(eth->dev, eth->irq[2], ++ mtk_handle_irq_rx, 0, ++ dev_name(eth->dev), eth); ++ } ++ if (err) ++ goto err_free_dev; ++ ++ /* No MT7628/88 support yet */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ err = mtk_mdio_init(eth); ++ if (err) ++ goto err_free_dev; ++ } ++ 
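The clock loop in mtk_probe() above distinguishes three outcomes of devm_clk_get(): probe deferral, a clock the SoC match data marks as required but that cannot be found, and a clock that is merely optional. A minimal sketch of that pattern is given here, under the assumption that the optional case may fall back to a NULL clk; mtk_demo_get_clks() and its parameters are illustrative names introduced for this note, not identifiers from the driver or from this patch.

#include <linux/bits.h>
#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

/* Request every named clock; defer, fail hard, or fall back to NULL
 * depending on how the SoC match data classifies that clock. */
static int mtk_demo_get_clks(struct device *dev, struct clk **clks,
			     const char * const *names, int num,
			     unsigned long required_bitmap)
{
	int i;

	for (i = 0; i < num; i++) {
		clks[i] = devm_clk_get(dev, names[i]);
		if (!IS_ERR(clks[i]))
			continue;

		if (PTR_ERR(clks[i]) == -EPROBE_DEFER)
			return -EPROBE_DEFER;	/* provider not ready yet */

		if (required_bitmap & BIT(i)) {
			dev_err(dev, "clock %s not found\n", names[i]);
			return -EINVAL;
		}

		/* optional clock: the clk API accepts a NULL handle */
		clks[i] = NULL;
	}

	return 0;
}

Treating a missing optional clock as NULL lets one probe path serve SoC variants with different clock trees, since clk_prepare_enable(NULL) and clk_disable_unprepare(NULL) simply return without doing anything.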
++ if (eth->soc->offload_version) { ++ eth->ppe = mtk_ppe_init(eth, eth->base + MTK_ETH_PPE_BASE, 2); ++ if (!eth->ppe) { ++ err = -ENOMEM; ++ goto err_deinit_mdio; ++ } ++ ++ err = mtk_eth_offload_init(eth); ++ if (err) ++ goto err_deinit_mdio; ++ } ++ ++ for (i = 0; i < MTK_MAX_DEVS; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ ++ err = register_netdev(eth->netdev[i]); ++ if (err) { ++ dev_err(eth->dev, "error bringing up device\n"); ++ goto err_deinit_mdio; ++ } else ++ netif_info(eth, probe, eth->netdev[i], ++ "mediatek frame engine at 0x%08lx, irq %d\n", ++ eth->netdev[i]->base_addr, eth->irq[0]); ++ } ++ ++ /* we run 2 devices on the same DMA ring so we need a dummy device ++ * for NAPI to work ++ */ ++ init_dummy_netdev(ð->dummy_dev); ++ netif_napi_add(ð->dummy_dev, ð->tx_napi, mtk_napi_tx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add(ð->dummy_dev, ð->rx_napi, mtk_napi_rx, ++ NAPI_POLL_WEIGHT); ++ ++ platform_set_drvdata(pdev, eth); ++ ++ return 0; ++ ++err_deinit_mdio: ++ mtk_mdio_cleanup(eth); ++err_free_dev: ++ mtk_free_dev(eth); ++err_deinit_hw: ++ mtk_hw_deinit(eth); ++err_wed_exit: ++ mtk_wed_exit(); ++ ++ return err; ++} ++ ++static int mtk_remove(struct platform_device *pdev) ++{ ++ struct mtk_eth *eth = platform_get_drvdata(pdev); ++ struct mtk_mac *mac; ++ int i; ++ ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ mac = netdev_priv(eth->netdev[i]); ++ phylink_disconnect_phy(mac->phylink); ++ } ++ ++ mtk_wed_exit(); ++ mtk_hw_deinit(eth); ++ ++ netif_napi_del(ð->tx_napi); ++ netif_napi_del(ð->rx_napi); ++ mtk_cleanup(eth); ++ mtk_mdio_cleanup(eth); ++ ++ return 0; ++} ++ ++static const struct mtk_soc_data mt2701_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7621_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7621_CAPS, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7621_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7622_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x2028, ++ .caps = MT7622_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7622_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7623_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ 
.rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7629_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7629_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7629_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7986_data = { ++ .reg_map = &mt7986_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7986_CAPS, ++ .required_clks = MT7986_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma_v2), ++ .rxd_size = sizeof(struct mtk_rx_dma_v2), ++ .rx_irq_done_mask = MTK_RX_DONE_INT_V2, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_V2, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, ++ .dma_len_offset = 8, ++ }, ++}; ++ ++static const struct mtk_soc_data rt5350_data = { ++ .reg_map = &mt7628_reg_map, ++ .caps = MT7628_CAPS, ++ .hw_features = MTK_HW_FEATURES_MT7628, ++ .required_clks = MT7628_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_PDMA, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++const struct of_device_id of_mtk_match[] = { ++ { .compatible = "mediatek,mt2701-eth", .data = &mt2701_data}, ++ { .compatible = "mediatek,mt7621-eth", .data = &mt7621_data}, ++ { .compatible = "mediatek,mt7622-eth", .data = &mt7622_data}, ++ { .compatible = "mediatek,mt7623-eth", .data = &mt7623_data}, ++ { .compatible = "mediatek,mt7629-eth", .data = &mt7629_data}, ++ { .compatible = "mediatek,mt7986-eth", .data = &mt7986_data}, ++ { .compatible = "ralink,rt5350-eth", .data = &rt5350_data}, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, of_mtk_match); ++ ++static struct platform_driver mtk_driver = { ++ .probe = mtk_probe, ++ .remove = mtk_remove, ++ .driver = { ++ .name = "mtk_soc_eth", ++ .of_match_table = of_mtk_match, ++ }, ++}; ++ ++module_platform_driver(mtk_driver); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("John Crispin "); ++MODULE_DESCRIPTION("Ethernet driver for MediaTek SoC"); +diff -rupN linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +--- linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-04 10:40:26.692034106 -0500 +@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struc for_each_possible_cpu(i) { p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i); do { @@ -2234,11 +12394,10 @@ index 30c7b0e157218..fa2753318cdf7 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c -index 9259a74eca40b..318dbbb482797 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_en.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c -@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c linux/drivers/net/ethernet/microsoft/mana/mana_en.c +--- 
linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-04 10:40:26.692034106 -0500 +@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_ rx_stats = &apc->rxqs[q]->stats; do { @@ -2251,7 +12410,7 @@ index 9259a74eca40b..318dbbb482797 100644 st->rx_packets += packets; st->rx_bytes += bytes; -@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_device *ndev, +@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_ tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2264,11 +12423,10 @@ index 9259a74eca40b..318dbbb482797 100644 st->tx_packets += packets; st->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -index c530db76880f0..96d55c91c9698 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +--- linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-04 10:40:26.692034106 -0500 +@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struc rx_stats = &apc->rxqs[q]->stats; do { @@ -2284,7 +12442,7 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struc tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2298,11 +12456,10 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -index 349a2b1a19a24..cf4d6f1129fa2 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_de unsigned int start; do { @@ -2328,11 +12485,10 @@ index 349a2b1a19a24..cf4d6f1129fa2 100644 stats->tx_packets += data[0]; stats->tx_bytes += data[1]; stats->tx_errors += data[2]; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -index b1b1b648e40cb..eeb1455a4e5db 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-04 10:40:26.692034106 -0500 
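The network-driver hunks in this stretch (mlxsw spectrum, mana, nfp, forcedeth, rmnet, 8139too, sni_ave) all make the same mechanical substitution: u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() become the plain u64_stats_fetch_begin()/u64_stats_fetch_retry(), while the surrounding snapshot-and-retry structure is left untouched. A minimal sketch of the reader/writer pairing those callers converge on is given here; struct demo_stats, demo_update() and demo_read() are illustrative names for this note only, not code carried by the patch.

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync syncp;	/* initialised once with u64_stats_init() */
};

/* writer side, e.g. a NAPI poll loop: publish one consistent update */
static void demo_update(struct demo_stats *s, u64 pkts, u64 len)
{
	u64_stats_update_begin(&s->syncp);
	s->packets += pkts;
	s->bytes += len;
	u64_stats_update_end(&s->syncp);
}

/* reader side, e.g. .ndo_get_stats64: retry until an untorn snapshot,
 * using the plain helpers these hunks switch the drivers to */
static void demo_read(const struct demo_stats *s, u64 *pkts, u64 *len)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin(&s->syncp);
		*pkts = s->packets;
		*len = s->bytes;
	} while (u64_stats_fetch_retry(&s->syncp, start));
}

The retry loop only does real work on configurations where a u64 cannot be read atomically; elsewhere the fetch helpers reduce to almost nothing, so the conversion does not change the cost of the readers shown in these hunks.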
+@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct unsigned int start; do { @@ -2341,7 +12497,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[0] = nn->r_vecs[i].rx_pkts; tmp[0] = nn->r_vecs[i].hw_csum_rx_ok; tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok; -@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[3] = nn->r_vecs[i].hw_csum_rx_error; tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail; tmp[5] = nn->r_vecs[i].hw_tls_rx; @@ -2354,7 +12510,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[1] = nn->r_vecs[i].tx_pkts; data[2] = nn->r_vecs[i].tx_busy; tmp[6] = nn->r_vecs[i].hw_csum_tx; -@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[10] = nn->r_vecs[i].hw_tls_tx; tmp[11] = nn->r_vecs[i].tls_tx_fallback; tmp[12] = nn->r_vecs[i].tls_tx_no_fallback; @@ -2363,11 +12519,10 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data += NN_RVEC_PER_Q_STATS; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -index 8b77582bdfa01..a6b6ca1fd55ee 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-04 10:40:26.692034106 -0500 +@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct n repr_stats = per_cpu_ptr(repr->stats, i); do { @@ -2383,11 +12538,10 @@ index 8b77582bdfa01..a6b6ca1fd55ee 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; -diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c -index 5116badaf0919..50ebbd7e91c48 100644 ---- a/drivers/net/ethernet/nvidia/forcedeth.c -+++ b/drivers/net/ethernet/nvidia/forcedeth.c -@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +diff -rupN linux.orig/drivers/net/ethernet/nvidia/forcedeth.c linux/drivers/net/ethernet/nvidia/forcedeth.c +--- linux.orig/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct u64 tx_packets, tx_bytes, tx_dropped; do { @@ -2402,7 +12556,7 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->rx_packets += rx_packets; storage->rx_bytes += rx_bytes; -@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct storage->rx_missed_errors += rx_missed_errors; do { @@ -2416,11 +12570,10 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->tx_packets += tx_packets; storage->tx_bytes += tx_bytes; -diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -index 1b2119b1d48aa..3f5e6572d20e7 100644 ---- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +--- linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-04 10:40:26.692034106 -0500 +@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { @@ -2432,11 +12585,10 @@ index 1b2119b1d48aa..3f5e6572d20e7 100644 total_stats.rx_pkts += snapshot.rx_pkts; total_stats.rx_bytes += snapshot.rx_bytes; -diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c -index 15b40fd93cd2e..82bd0eb614634 100644 ---- a/drivers/net/ethernet/realtek/8139too.c -+++ b/drivers/net/ethernet/realtek/8139too.c -@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/realtek/8139too.c linux/drivers/net/ethernet/realtek/8139too.c +--- linux.orig/drivers/net/ethernet/realtek/8139too.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/realtek/8139too.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2457,11 +12609,10 @@ index 15b40fd93cd2e..82bd0eb614634 100644 } /* Set or clear the multicast filter for this adaptor. -diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c -index f0c8de2c60755..d4f7238333bb7 100644 ---- a/drivers/net/ethernet/socionext/sni_ave.c -+++ b/drivers/net/ethernet/socionext/sni_ave.c -@@ -1506,16 +1506,16 @@ static void ave_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c linux/drivers/net/ethernet/socionext/sni_ave.c +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1508,16 +1508,16 @@ static void ave_get_stats64(struct net_d unsigned int start; do { @@ -2482,11 +12633,2010 @@ index f0c8de2c60755..d4f7238333bb7 100644 stats->rx_errors = priv->stats_rx.errors; stats->tx_errors = priv->stats_tx.errors; -diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -index f4a6b590a1e39..1b62400c19049 100644 ---- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c -+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig linux/drivers/net/ethernet/socionext/sni_ave.c.orig +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c.orig 2022-12-04 10:40:18.168055947 -0500 +@@ -0,0 +1,1996 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * sni_ave.c - Socionext UniPhier AVE ethernet driver ++ * Copyright 2014 Panasonic Corporation ++ * Copyright 2015-2017 Socionext Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* General Register Group */ ++#define AVE_IDR 0x000 /* ID */ ++#define AVE_VR 0x004 /* Version */ ++#define AVE_GRR 0x008 /* Global Reset */ ++#define AVE_CFGR 0x00c /* Configuration */ ++ ++/* Interrupt Register Group */ ++#define AVE_GIMR 0x100 /* Global Interrupt Mask */ ++#define AVE_GISR 0x104 /* Global Interrupt Status */ ++ ++/* MAC Register Group */ ++#define AVE_TXCR 0x200 /* TX Setup */ ++#define AVE_RXCR 0x204 /* RX Setup */ ++#define AVE_RXMAC1R 0x208 /* MAC address (lower) */ ++#define AVE_RXMAC2R 0x20c /* MAC address (upper) */ ++#define AVE_MDIOCTR 0x214 /* MDIO Control */ ++#define AVE_MDIOAR 0x218 /* MDIO Address */ ++#define AVE_MDIOWDR 0x21c /* MDIO Data */ ++#define AVE_MDIOSR 0x220 /* MDIO Status */ ++#define AVE_MDIORDR 0x224 /* MDIO Rd Data */ ++ ++/* Descriptor Control Register Group */ ++#define AVE_DESCC 0x300 /* Descriptor Control */ ++#define AVE_TXDC 0x304 /* TX Descriptor Configuration */ ++#define AVE_RXDC0 0x308 /* RX Descriptor Ring0 Configuration */ ++#define AVE_IIRQC 0x34c /* Interval IRQ Control */ ++ ++/* Packet Filter Register Group */ ++#define AVE_PKTF_BASE 0x800 /* PF Base Address */ ++#define AVE_PFMBYTE_BASE 0xd00 /* PF Mask Byte Base Address */ ++#define AVE_PFMBIT_BASE 0xe00 /* PF Mask Bit Base Address */ ++#define AVE_PFSEL_BASE 0xf00 /* PF Selector Base Address */ ++#define AVE_PFEN 0xffc /* Packet Filter Enable */ ++#define AVE_PKTF(ent) (AVE_PKTF_BASE + (ent) * 0x40) ++#define AVE_PFMBYTE(ent) (AVE_PFMBYTE_BASE + (ent) * 8) ++#define AVE_PFMBIT(ent) (AVE_PFMBIT_BASE + (ent) * 4) ++#define AVE_PFSEL(ent) (AVE_PFSEL_BASE + (ent) * 4) ++ ++/* 64bit descriptor memory */ ++#define AVE_DESC_SIZE_64 12 /* Descriptor Size */ ++ ++#define AVE_TXDM_64 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_64 0x1c00 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_64 0x0ba0 /* Tx Descriptor Memory Size 3KB */ ++#define AVE_RXDM_SIZE_64 0x6000 /* Rx Descriptor Memory Size 24KB */ ++ ++/* 32bit descriptor memory */ ++#define AVE_DESC_SIZE_32 8 /* Descriptor Size */ ++ ++#define AVE_TXDM_32 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_32 0x1800 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_32 0x07c0 /* Tx Descriptor Memory Size 2KB */ ++#define AVE_RXDM_SIZE_32 0x4000 /* Rx Descriptor Memory Size 16KB */ ++ ++/* RMII Bridge Register Group */ ++#define AVE_RSTCTRL 0x8028 /* Reset control */ ++#define AVE_RSTCTRL_RMIIRST BIT(16) ++#define AVE_LINKSEL 0x8034 /* Link speed setting */ ++#define AVE_LINKSEL_100M BIT(0) ++ ++/* AVE_GRR */ ++#define AVE_GRR_RXFFR BIT(5) /* Reset RxFIFO */ ++#define AVE_GRR_PHYRST BIT(4) /* Reset external PHY */ ++#define AVE_GRR_GRST BIT(0) /* Reset all MAC */ ++ ++/* AVE_CFGR */ ++#define AVE_CFGR_FLE BIT(31) /* Filter Function */ ++#define AVE_CFGR_CHE BIT(30) /* Checksum Function */ ++#define AVE_CFGR_MII BIT(27) /* Func mode (1:MII/RMII, 0:RGMII) */ ++#define AVE_CFGR_IPFCEN BIT(24) /* IP fragment sum Enable */ ++ ++/* AVE_GISR (common with GIMR) */ ++#define AVE_GI_PHY BIT(24) /* PHY interrupt */ ++#define AVE_GI_TX BIT(16) /* Tx complete */ ++#define AVE_GI_RXERR BIT(8) /* Receive frame more than max size */ ++#define AVE_GI_RXOVF BIT(7) /* Overflow at the RxFIFO */ ++#define AVE_GI_RXDROP BIT(6) /* Drop packet */ ++#define AVE_GI_RXIINT BIT(5) /* Interval interrupt */ ++ ++/* AVE_TXCR */ 
++#define AVE_TXCR_FLOCTR BIT(18) /* Flow control */ ++#define AVE_TXCR_TXSPD_1G BIT(17) ++#define AVE_TXCR_TXSPD_100 BIT(16) ++ ++/* AVE_RXCR */ ++#define AVE_RXCR_RXEN BIT(30) /* Rx enable */ ++#define AVE_RXCR_FDUPEN BIT(22) /* Interface mode */ ++#define AVE_RXCR_FLOCTR BIT(21) /* Flow control */ ++#define AVE_RXCR_AFEN BIT(19) /* MAC address filter */ ++#define AVE_RXCR_DRPEN BIT(18) /* Drop pause frame */ ++#define AVE_RXCR_MPSIZ_MASK GENMASK(10, 0) ++ ++/* AVE_MDIOCTR */ ++#define AVE_MDIOCTR_RREQ BIT(3) /* Read request */ ++#define AVE_MDIOCTR_WREQ BIT(2) /* Write request */ ++ ++/* AVE_MDIOSR */ ++#define AVE_MDIOSR_STS BIT(0) /* access status */ ++ ++/* AVE_DESCC */ ++#define AVE_DESCC_STATUS_MASK GENMASK(31, 16) ++#define AVE_DESCC_RD0 BIT(8) /* Enable Rx descriptor Ring0 */ ++#define AVE_DESCC_RDSTP BIT(4) /* Pause Rx descriptor */ ++#define AVE_DESCC_TD BIT(0) /* Enable Tx descriptor */ ++ ++/* AVE_TXDC */ ++#define AVE_TXDC_SIZE GENMASK(27, 16) /* Size of Tx descriptor */ ++#define AVE_TXDC_ADDR GENMASK(11, 0) /* Start address */ ++#define AVE_TXDC_ADDR_START 0 ++ ++/* AVE_RXDC0 */ ++#define AVE_RXDC0_SIZE GENMASK(30, 16) /* Size of Rx descriptor */ ++#define AVE_RXDC0_ADDR GENMASK(14, 0) /* Start address */ ++#define AVE_RXDC0_ADDR_START 0 ++ ++/* AVE_IIRQC */ ++#define AVE_IIRQC_EN0 BIT(27) /* Enable interval interrupt Ring0 */ ++#define AVE_IIRQC_BSCK GENMASK(15, 0) /* Interval count unit */ ++ ++/* Command status for descriptor */ ++#define AVE_STS_OWN BIT(31) /* Descriptor ownership */ ++#define AVE_STS_INTR BIT(29) /* Request for interrupt */ ++#define AVE_STS_OK BIT(27) /* Normal transmit */ ++/* TX */ ++#define AVE_STS_NOCSUM BIT(28) /* No use HW checksum */ ++#define AVE_STS_1ST BIT(26) /* Head of buffer chain */ ++#define AVE_STS_LAST BIT(25) /* Tail of buffer chain */ ++#define AVE_STS_OWC BIT(21) /* Out of window,Late Collision */ ++#define AVE_STS_EC BIT(20) /* Excess collision occurred */ ++#define AVE_STS_PKTLEN_TX_MASK GENMASK(15, 0) ++/* RX */ ++#define AVE_STS_CSSV BIT(21) /* Checksum check performed */ ++#define AVE_STS_CSER BIT(20) /* Checksum error detected */ ++#define AVE_STS_PKTLEN_RX_MASK GENMASK(10, 0) ++ ++/* Packet filter */ ++#define AVE_PFMBYTE_MASK0 (GENMASK(31, 8) | GENMASK(5, 0)) ++#define AVE_PFMBYTE_MASK1 GENMASK(25, 0) ++#define AVE_PFMBIT_MASK GENMASK(15, 0) ++ ++#define AVE_PF_SIZE 17 /* Number of all packet filter */ ++#define AVE_PF_MULTICAST_SIZE 7 /* Number of multicast filter */ ++ ++#define AVE_PFNUM_FILTER 0 /* No.0 */ ++#define AVE_PFNUM_UNICAST 1 /* No.1 */ ++#define AVE_PFNUM_BROADCAST 2 /* No.2 */ ++#define AVE_PFNUM_MULTICAST 11 /* No.11-17 */ ++ ++/* NETIF Message control */ ++#define AVE_DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | \ ++ NETIF_MSG_PROBE | \ ++ NETIF_MSG_LINK | \ ++ NETIF_MSG_TIMER | \ ++ NETIF_MSG_IFDOWN | \ ++ NETIF_MSG_IFUP | \ ++ NETIF_MSG_RX_ERR | \ ++ NETIF_MSG_TX_ERR) ++ ++/* Parameter for descriptor */ ++#define AVE_NR_TXDESC 64 /* Tx descriptor */ ++#define AVE_NR_RXDESC 256 /* Rx descriptor */ ++ ++#define AVE_DESC_OFS_CMDSTS 0 ++#define AVE_DESC_OFS_ADDRL 4 ++#define AVE_DESC_OFS_ADDRU 8 ++ ++/* Parameter for ethernet frame */ ++#define AVE_MAX_ETHFRAME 1518 ++#define AVE_FRAME_HEADROOM 2 ++ ++/* Parameter for interrupt */ ++#define AVE_INTM_COUNT 20 ++#define AVE_FORCE_TXINTCNT 1 ++ ++/* SG */ ++#define SG_ETPINMODE 0x540 ++#define SG_ETPINMODE_EXTPHY BIT(1) /* for LD11 */ ++#define SG_ETPINMODE_RMII(ins) BIT(ins) ++ ++#define IS_DESC_64BIT(p) ((p)->data->is_desc_64bit) ++ ++#define AVE_MAX_CLKS 4 ++#define 
AVE_MAX_RSTS 2 ++ ++enum desc_id { ++ AVE_DESCID_RX, ++ AVE_DESCID_TX, ++}; ++ ++enum desc_state { ++ AVE_DESC_RX_PERMIT, ++ AVE_DESC_RX_SUSPEND, ++ AVE_DESC_START, ++ AVE_DESC_STOP, ++}; ++ ++struct ave_desc { ++ struct sk_buff *skbs; ++ dma_addr_t skbs_dma; ++ size_t skbs_dmalen; ++}; ++ ++struct ave_desc_info { ++ u32 ndesc; /* number of descriptor */ ++ u32 daddr; /* start address of descriptor */ ++ u32 proc_idx; /* index of processing packet */ ++ u32 done_idx; /* index of processed packet */ ++ struct ave_desc *desc; /* skb info related descriptor */ ++}; ++ ++struct ave_stats { ++ struct u64_stats_sync syncp; ++ u64 packets; ++ u64 bytes; ++ u64 errors; ++ u64 dropped; ++ u64 collisions; ++ u64 fifo_errors; ++}; ++ ++struct ave_private { ++ void __iomem *base; ++ int irq; ++ int phy_id; ++ unsigned int desc_size; ++ u32 msg_enable; ++ int nclks; ++ struct clk *clk[AVE_MAX_CLKS]; ++ int nrsts; ++ struct reset_control *rst[AVE_MAX_RSTS]; ++ phy_interface_t phy_mode; ++ struct phy_device *phydev; ++ struct mii_bus *mdio; ++ struct regmap *regmap; ++ unsigned int pinmode_mask; ++ unsigned int pinmode_val; ++ u32 wolopts; ++ ++ /* stats */ ++ struct ave_stats stats_rx; ++ struct ave_stats stats_tx; ++ ++ /* NAPI support */ ++ struct net_device *ndev; ++ struct napi_struct napi_rx; ++ struct napi_struct napi_tx; ++ ++ /* descriptor */ ++ struct ave_desc_info rx; ++ struct ave_desc_info tx; ++ ++ /* flow control */ ++ int pause_auto; ++ int pause_rx; ++ int pause_tx; ++ ++ const struct ave_soc_data *data; ++}; ++ ++struct ave_soc_data { ++ bool is_desc_64bit; ++ const char *clock_names[AVE_MAX_CLKS]; ++ const char *reset_names[AVE_MAX_RSTS]; ++ int (*get_pinmode)(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg); ++}; ++ ++static u32 ave_desc_read(struct net_device *ndev, enum desc_id id, int entry, ++ int offset) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ return readl(priv->base + addr); ++} ++ ++static u32 ave_desc_read_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry) ++{ ++ return ave_desc_read(ndev, id, entry, AVE_DESC_OFS_CMDSTS); ++} ++ ++static void ave_desc_write(struct net_device *ndev, enum desc_id id, ++ int entry, int offset, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? 
priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ writel(val, priv->base + addr); ++} ++ ++static void ave_desc_write_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry, u32 val) ++{ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_CMDSTS, val); ++} ++ ++static void ave_desc_write_addr(struct net_device *ndev, enum desc_id id, ++ int entry, dma_addr_t paddr) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_ADDRL, ++ lower_32_bits(paddr)); ++ if (IS_DESC_64BIT(priv)) ++ ave_desc_write(ndev, id, ++ entry, AVE_DESC_OFS_ADDRU, ++ upper_32_bits(paddr)); ++} ++ ++static u32 ave_irq_disable_all(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 ret; ++ ++ ret = readl(priv->base + AVE_GIMR); ++ writel(0, priv->base + AVE_GIMR); ++ ++ return ret; ++} ++ ++static void ave_irq_restore(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(val, priv->base + AVE_GIMR); ++} ++ ++static void ave_irq_enable(struct net_device *ndev, u32 bitflag) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(readl(priv->base + AVE_GIMR) | bitflag, priv->base + AVE_GIMR); ++ writel(bitflag, priv->base + AVE_GISR); ++} ++ ++static void ave_hw_write_macaddr(struct net_device *ndev, ++ const unsigned char *mac_addr, ++ int reg1, int reg2) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(mac_addr[0] | mac_addr[1] << 8 | ++ mac_addr[2] << 16 | mac_addr[3] << 24, priv->base + reg1); ++ writel(mac_addr[4] | mac_addr[5] << 8, priv->base + reg2); ++} ++ ++static void ave_hw_read_version(struct net_device *ndev, char *buf, int len) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 major, minor, vr; ++ ++ vr = readl(priv->base + AVE_VR); ++ major = (vr & GENMASK(15, 8)) >> 8; ++ minor = (vr & GENMASK(7, 0)); ++ snprintf(buf, len, "v%u.%u", major, minor); ++} ++ ++static void ave_ethtool_get_drvinfo(struct net_device *ndev, ++ struct ethtool_drvinfo *info) ++{ ++ struct device *dev = ndev->dev.parent; ++ ++ strlcpy(info->driver, dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(dev), sizeof(info->bus_info)); ++ ave_hw_read_version(ndev, info->fw_version, sizeof(info->fw_version)); ++} ++ ++static u32 ave_ethtool_get_msglevel(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ return priv->msg_enable; ++} ++ ++static void ave_ethtool_set_msglevel(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ priv->msg_enable = val; ++} ++ ++static void ave_ethtool_get_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ wol->supported = 0; ++ wol->wolopts = 0; ++ ++ if (ndev->phydev) ++ phy_ethtool_get_wol(ndev->phydev, wol); ++} ++ ++static int __ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ if (!ndev->phydev || ++ (wol->wolopts & (WAKE_ARP | WAKE_MAGICSECURE))) ++ return -EOPNOTSUPP; ++ ++ return phy_ethtool_set_wol(ndev->phydev, wol); ++} ++ ++static int ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ int ret; ++ ++ ret = __ave_ethtool_set_wol(ndev, wol); ++ if (!ret) ++ device_set_wakeup_enable(&ndev->dev, !!wol->wolopts); ++ ++ return ret; ++} ++ ++static void ave_ethtool_get_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ pause->autoneg = priv->pause_auto; ++ 
pause->rx_pause = priv->pause_rx; ++ pause->tx_pause = priv->pause_tx; ++} ++ ++static int ave_ethtool_set_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ ++ if (!phydev) ++ return -EINVAL; ++ ++ priv->pause_auto = pause->autoneg; ++ priv->pause_rx = pause->rx_pause; ++ priv->pause_tx = pause->tx_pause; ++ ++ phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause); ++ ++ return 0; ++} ++ ++static const struct ethtool_ops ave_ethtool_ops = { ++ .get_link_ksettings = phy_ethtool_get_link_ksettings, ++ .set_link_ksettings = phy_ethtool_set_link_ksettings, ++ .get_drvinfo = ave_ethtool_get_drvinfo, ++ .nway_reset = phy_ethtool_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_msglevel = ave_ethtool_get_msglevel, ++ .set_msglevel = ave_ethtool_set_msglevel, ++ .get_wol = ave_ethtool_get_wol, ++ .set_wol = ave_ethtool_set_wol, ++ .get_pauseparam = ave_ethtool_get_pauseparam, ++ .set_pauseparam = ave_ethtool_set_pauseparam, ++}; ++ ++static int ave_mdiobus_read(struct mii_bus *bus, int phyid, int regnum) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* read request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_RREQ) & ~AVE_MDIOCTR_WREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) { ++ netdev_err(ndev, "failed to read (phy:%d reg:%x)\n", ++ phyid, regnum); ++ return ret; ++ } ++ ++ return readl(priv->base + AVE_MDIORDR) & GENMASK(15, 0); ++} ++ ++static int ave_mdiobus_write(struct mii_bus *bus, int phyid, int regnum, ++ u16 val) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* write data */ ++ writel(val, priv->base + AVE_MDIOWDR); ++ ++ /* write request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_WREQ) & ~AVE_MDIOCTR_RREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) ++ netdev_err(ndev, "failed to write (phy:%d reg:%x)\n", ++ phyid, regnum); ++ ++ return ret; ++} ++ ++static int ave_dma_map(struct net_device *ndev, struct ave_desc *desc, ++ void *ptr, size_t len, enum dma_data_direction dir, ++ dma_addr_t *paddr) ++{ ++ dma_addr_t map_addr; ++ ++ map_addr = dma_map_single(ndev->dev.parent, ptr, len, dir); ++ if (unlikely(dma_mapping_error(ndev->dev.parent, map_addr))) ++ return -ENOMEM; ++ ++ desc->skbs_dma = map_addr; ++ desc->skbs_dmalen = len; ++ *paddr = map_addr; ++ ++ return 0; ++} ++ ++static void ave_dma_unmap(struct net_device *ndev, struct ave_desc *desc, ++ enum dma_data_direction dir) ++{ ++ if (!desc->skbs_dma) ++ return; ++ ++ dma_unmap_single(ndev->dev.parent, ++ desc->skbs_dma, desc->skbs_dmalen, dir); ++ desc->skbs_dma = 0; ++} ++ ++/* Prepare Rx descriptor and memory */ ++static int ave_rxdesc_prepare(struct net_device *ndev, int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct sk_buff *skb; ++ dma_addr_t paddr; ++ int ret; ++ ++ skb = priv->rx.desc[entry].skbs; ++ if (!skb) { ++ 
skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); ++ if (!skb) { ++ netdev_err(ndev, "can't allocate skb for Rx\n"); ++ return -ENOMEM; ++ } ++ skb->data += AVE_FRAME_HEADROOM; ++ skb->tail += AVE_FRAME_HEADROOM; ++ } ++ ++ /* set disable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_STS_OWN); ++ ++ /* map Rx buffer ++ * Rx buffer set to the Rx descriptor has two restrictions: ++ * - Rx buffer address is 4 byte aligned. ++ * - Rx buffer begins with 2 byte headroom, and data will be put from ++ * (buffer + 2). ++ * To satisfy this, specify the address to put back the buffer ++ * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size ++ * by AVE_FRAME_HEADROOM. ++ */ ++ ret = ave_dma_map(ndev, &priv->rx.desc[entry], ++ skb->data - AVE_FRAME_HEADROOM, ++ AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, ++ DMA_FROM_DEVICE, &paddr); ++ if (ret) { ++ netdev_err(ndev, "can't map skb for Rx\n"); ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ priv->rx.desc[entry].skbs = skb; ++ ++ /* set buffer pointer */ ++ ave_desc_write_addr(ndev, AVE_DESCID_RX, entry, paddr); ++ ++ /* set enable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_MAX_ETHFRAME); ++ ++ return ret; ++} ++ ++/* Switch state of descriptor */ ++static int ave_desc_switch(struct net_device *ndev, enum desc_state state) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ u32 val; ++ ++ switch (state) { ++ case AVE_DESC_START: ++ writel(AVE_DESCC_TD | AVE_DESCC_RD0, priv->base + AVE_DESCC); ++ break; ++ ++ case AVE_DESC_STOP: ++ writel(0, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, !val, ++ 150, 15000)) { ++ netdev_err(ndev, "can't stop descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_SUSPEND: ++ val = readl(priv->base + AVE_DESCC); ++ val |= AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, ++ val & (AVE_DESCC_RDSTP << 16), ++ 150, 150000)) { ++ netdev_err(ndev, "can't suspend descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_PERMIT: ++ val = readl(priv->base + AVE_DESCC); ++ val &= ~AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ave_tx_complete(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ unsigned int nr_freebuf = 0; ++ unsigned int tx_packets = 0; ++ unsigned int tx_bytes = 0; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ ++ /* free pre-stored skb from done_idx to proc_idx */ ++ while (proc_idx != done_idx) { ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_TX, done_idx); ++ ++ /* do nothing if owner is HW (==1 for Tx) */ ++ if (cmdsts & AVE_STS_OWN) ++ break; ++ ++ /* check Tx status and updates statistics */ ++ if (cmdsts & AVE_STS_OK) { ++ tx_bytes += cmdsts & AVE_STS_PKTLEN_TX_MASK; ++ /* success */ ++ if (cmdsts & AVE_STS_LAST) ++ tx_packets++; ++ } else { ++ /* error */ ++ if (cmdsts & AVE_STS_LAST) { ++ priv->stats_tx.errors++; ++ if (cmdsts & (AVE_STS_OWC | AVE_STS_EC)) ++ priv->stats_tx.collisions++; ++ } ++ } ++ ++ /* release skb */ ++ if (priv->tx.desc[done_idx].skbs) { ++ ave_dma_unmap(ndev, &priv->tx.desc[done_idx], ++ DMA_TO_DEVICE); ++ 
dev_consume_skb_any(priv->tx.desc[done_idx].skbs); ++ priv->tx.desc[done_idx].skbs = NULL; ++ nr_freebuf++; ++ } ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->tx.done_idx = done_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_tx.syncp); ++ priv->stats_tx.packets += tx_packets; ++ priv->stats_tx.bytes += tx_bytes; ++ u64_stats_update_end(&priv->stats_tx.syncp); ++ ++ /* wake queue for freeing buffer */ ++ if (unlikely(netif_queue_stopped(ndev)) && nr_freebuf) ++ netif_wake_queue(ndev); ++ ++ return nr_freebuf; ++} ++ ++static int ave_rx_receive(struct net_device *ndev, int num) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int rx_packets = 0; ++ unsigned int rx_bytes = 0; ++ u32 proc_idx, done_idx; ++ struct sk_buff *skb; ++ unsigned int pktlen; ++ int restpkt, npkts; ++ u32 ndesc, cmdsts; ++ ++ proc_idx = priv->rx.proc_idx; ++ done_idx = priv->rx.done_idx; ++ ndesc = priv->rx.ndesc; ++ restpkt = ((proc_idx + ndesc - 1) - done_idx) % ndesc; ++ ++ for (npkts = 0; npkts < num; npkts++) { ++ /* we can't receive more packet, so fill desc quickly */ ++ if (--restpkt < 0) ++ break; ++ ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_RX, proc_idx); ++ ++ /* do nothing if owner is HW (==0 for Rx) */ ++ if (!(cmdsts & AVE_STS_OWN)) ++ break; ++ ++ if (!(cmdsts & AVE_STS_OK)) { ++ priv->stats_rx.errors++; ++ proc_idx = (proc_idx + 1) % ndesc; ++ continue; ++ } ++ ++ pktlen = cmdsts & AVE_STS_PKTLEN_RX_MASK; ++ ++ /* get skbuff for rx */ ++ skb = priv->rx.desc[proc_idx].skbs; ++ priv->rx.desc[proc_idx].skbs = NULL; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[proc_idx], DMA_FROM_DEVICE); ++ ++ skb->dev = ndev; ++ skb_put(skb, pktlen); ++ skb->protocol = eth_type_trans(skb, ndev); ++ ++ if ((cmdsts & AVE_STS_CSSV) && (!(cmdsts & AVE_STS_CSER))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ rx_packets++; ++ rx_bytes += pktlen; ++ ++ netif_receive_skb(skb); ++ ++ proc_idx = (proc_idx + 1) % ndesc; ++ } ++ ++ priv->rx.proc_idx = proc_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_rx.syncp); ++ priv->stats_rx.packets += rx_packets; ++ priv->stats_rx.bytes += rx_bytes; ++ u64_stats_update_end(&priv->stats_rx.syncp); ++ ++ /* refill the Rx buffers */ ++ while (proc_idx != done_idx) { ++ if (ave_rxdesc_prepare(ndev, done_idx)) ++ break; ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->rx.done_idx = done_idx; ++ ++ return npkts; ++} ++ ++static int ave_napi_poll_rx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_rx); ++ ndev = priv->ndev; ++ ++ num = ave_rx_receive(ndev, budget); ++ if (num < budget) { ++ napi_complete_done(napi, num); ++ ++ /* enable Rx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_RXIINT); ++ } ++ ++ return num; ++} ++ ++static int ave_napi_poll_tx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_tx); ++ ndev = priv->ndev; ++ ++ num = ave_tx_complete(ndev); ++ napi_complete(napi); ++ ++ /* enable Tx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_TX); ++ ++ return num; ++} ++ ++static void ave_global_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ /* set config register */ ++ val = AVE_CFGR_FLE | AVE_CFGR_IPFCEN | AVE_CFGR_CHE; ++ if (!phy_interface_mode_is_rgmii(priv->phy_mode)) ++ val |= 
AVE_CFGR_MII; ++ writel(val, priv->base + AVE_CFGR); ++ ++ /* reset RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val &= ~AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ /* assert reset */ ++ writel(AVE_GRR_GRST | AVE_GRR_PHYRST, priv->base + AVE_GRR); ++ msleep(20); ++ ++ /* 1st, negate PHY reset only */ ++ writel(AVE_GRR_GRST, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val |= AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ ave_irq_disable_all(ndev); ++} ++ ++static void ave_rxfifo_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 rxcr_org; ++ ++ /* save and disable MAC receive op */ ++ rxcr_org = readl(priv->base + AVE_RXCR); ++ writel(rxcr_org & (~AVE_RXCR_RXEN), priv->base + AVE_RXCR); ++ ++ /* suspend Rx descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_SUSPEND); ++ ++ /* receive all packets before descriptor starts */ ++ ave_rx_receive(ndev, priv->rx.ndesc); ++ ++ /* assert reset */ ++ writel(AVE_GRR_RXFFR, priv->base + AVE_GRR); ++ udelay(50); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ udelay(20); ++ ++ /* negate interrupt status */ ++ writel(AVE_GI_RXOVF, priv->base + AVE_GISR); ++ ++ /* permit descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_PERMIT); ++ ++ /* restore MAC reccieve op */ ++ writel(rxcr_org, priv->base + AVE_RXCR); ++} ++ ++static irqreturn_t ave_irq_handler(int irq, void *netdev) ++{ ++ struct net_device *ndev = (struct net_device *)netdev; ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 gimr_val, gisr_val; ++ ++ gimr_val = ave_irq_disable_all(ndev); ++ ++ /* get interrupt status */ ++ gisr_val = readl(priv->base + AVE_GISR); ++ ++ /* PHY */ ++ if (gisr_val & AVE_GI_PHY) ++ writel(AVE_GI_PHY, priv->base + AVE_GISR); ++ ++ /* check exceeding packet */ ++ if (gisr_val & AVE_GI_RXERR) { ++ writel(AVE_GI_RXERR, priv->base + AVE_GISR); ++ netdev_err(ndev, "receive a packet exceeding frame buffer\n"); ++ } ++ ++ gisr_val &= gimr_val; ++ if (!gisr_val) ++ goto exit_isr; ++ ++ /* RxFIFO overflow */ ++ if (gisr_val & AVE_GI_RXOVF) { ++ priv->stats_rx.fifo_errors++; ++ ave_rxfifo_reset(ndev); ++ goto exit_isr; ++ } ++ ++ /* Rx drop */ ++ if (gisr_val & AVE_GI_RXDROP) { ++ priv->stats_rx.dropped++; ++ writel(AVE_GI_RXDROP, priv->base + AVE_GISR); ++ } ++ ++ /* Rx interval */ ++ if (gisr_val & AVE_GI_RXIINT) { ++ napi_schedule(&priv->napi_rx); ++ /* still force to disable Rx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_RXIINT; ++ } ++ ++ /* Tx completed */ ++ if (gisr_val & AVE_GI_TX) { ++ napi_schedule(&priv->napi_tx); ++ /* still force to disable Tx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_TX; ++ } ++ ++exit_isr: ++ ave_irq_restore(ndev, gimr_val); ++ ++ return IRQ_HANDLED; ++} ++ ++static int ave_pfsel_start(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val | BIT(entry), priv->base + AVE_PFEN); ++ ++ return 0; ++} ++ ++static int ave_pfsel_stop(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val & ~BIT(entry), priv->base + AVE_PFEN); 
++ ++ return 0; ++} ++ ++static int ave_pfsel_set_macaddr(struct net_device *ndev, ++ unsigned int entry, ++ const unsigned char *mac_addr, ++ unsigned int set_size) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ if (WARN_ON(set_size > 6)) ++ return -EINVAL; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set MAC address for the filter */ ++ ave_hw_write_macaddr(ndev, mac_addr, ++ AVE_PKTF(entry), AVE_PKTF(entry) + 4); ++ ++ /* set byte mask */ ++ writel(GENMASK(31, set_size) & AVE_PFMBYTE_MASK0, ++ priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to ring 0 */ ++ writel(0, priv->base + AVE_PFSEL(entry)); ++ ++ /* restart filter */ ++ ave_pfsel_start(ndev, entry); ++ ++ return 0; ++} ++ ++static void ave_pfsel_set_promisc(struct net_device *ndev, ++ unsigned int entry, u32 rxring) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set byte mask */ ++ writel(AVE_PFMBYTE_MASK0, priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to rxring */ ++ writel(rxring, priv->base + AVE_PFSEL(entry)); ++ ++ ave_pfsel_start(ndev, entry); ++} ++ ++static void ave_pfsel_init(struct net_device *ndev) ++{ ++ unsigned char bcast_mac[ETH_ALEN]; ++ int i; ++ ++ eth_broadcast_addr(bcast_mac); ++ ++ for (i = 0; i < AVE_PF_SIZE; i++) ++ ave_pfsel_stop(ndev, i); ++ ++ /* promiscious entry, select ring 0 */ ++ ave_pfsel_set_promisc(ndev, AVE_PFNUM_FILTER, 0); ++ ++ /* unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++ ++ /* broadcast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_BROADCAST, bcast_mac, 6); ++} ++ ++static void ave_phy_adjust_link(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ u32 val, txcr, rxcr, rxcr_org; ++ u16 rmt_adv = 0, lcl_adv = 0; ++ u8 cap; ++ ++ /* set RGMII speed */ ++ val = readl(priv->base + AVE_TXCR); ++ val &= ~(AVE_TXCR_TXSPD_100 | AVE_TXCR_TXSPD_1G); ++ ++ if (phy_interface_is_rgmii(phydev) && phydev->speed == SPEED_1000) ++ val |= AVE_TXCR_TXSPD_1G; ++ else if (phydev->speed == SPEED_100) ++ val |= AVE_TXCR_TXSPD_100; ++ ++ writel(val, priv->base + AVE_TXCR); ++ ++ /* set RMII speed (100M/10M only) */ ++ if (!phy_interface_is_rgmii(phydev)) { ++ val = readl(priv->base + AVE_LINKSEL); ++ if (phydev->speed == SPEED_10) ++ val &= ~AVE_LINKSEL_100M; ++ else ++ val |= AVE_LINKSEL_100M; ++ writel(val, priv->base + AVE_LINKSEL); ++ } ++ ++ /* check current RXCR/TXCR */ ++ rxcr = readl(priv->base + AVE_RXCR); ++ txcr = readl(priv->base + AVE_TXCR); ++ rxcr_org = rxcr; ++ ++ if (phydev->duplex) { ++ rxcr |= AVE_RXCR_FDUPEN; ++ ++ if (phydev->pause) ++ rmt_adv |= LPA_PAUSE_CAP; ++ if (phydev->asym_pause) ++ rmt_adv |= LPA_PAUSE_ASYM; ++ ++ lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising); ++ cap = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); ++ if (cap & FLOW_CTRL_TX) ++ txcr |= AVE_TXCR_FLOCTR; ++ else ++ txcr &= ~AVE_TXCR_FLOCTR; ++ if (cap & FLOW_CTRL_RX) ++ rxcr |= AVE_RXCR_FLOCTR; ++ else ++ rxcr &= ~AVE_RXCR_FLOCTR; ++ } else { ++ rxcr &= ~AVE_RXCR_FDUPEN; ++ rxcr &= 
~AVE_RXCR_FLOCTR; ++ txcr &= ~AVE_TXCR_FLOCTR; ++ } ++ ++ if (rxcr_org != rxcr) { ++ /* disable Rx mac */ ++ writel(rxcr & ~AVE_RXCR_RXEN, priv->base + AVE_RXCR); ++ /* change and enable TX/Rx mac */ ++ writel(txcr, priv->base + AVE_TXCR); ++ writel(rxcr, priv->base + AVE_RXCR); ++ } ++ ++ phy_print_status(phydev); ++} ++ ++static void ave_macaddr_init(struct net_device *ndev) ++{ ++ ave_hw_write_macaddr(ndev, ndev->dev_addr, AVE_RXMAC1R, AVE_RXMAC2R); ++ ++ /* pfsel unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++} ++ ++static int ave_init(struct net_device *ndev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct ave_private *priv = netdev_priv(ndev); ++ struct device *dev = ndev->dev.parent; ++ struct device_node *np = dev->of_node; ++ struct device_node *mdio_np; ++ struct phy_device *phydev; ++ int nc, nr, ret; ++ ++ /* enable clk because of hw access until ndo_open */ ++ for (nc = 0; nc < priv->nclks; nc++) { ++ ret = clk_prepare_enable(priv->clk[nc]); ++ if (ret) { ++ dev_err(dev, "can't enable clock\n"); ++ goto out_clk_disable; ++ } ++ } ++ ++ for (nr = 0; nr < priv->nrsts; nr++) { ++ ret = reset_control_deassert(priv->rst[nr]); ++ if (ret) { ++ dev_err(dev, "can't deassert reset\n"); ++ goto out_reset_assert; ++ } ++ } ++ ++ ret = regmap_update_bits(priv->regmap, SG_ETPINMODE, ++ priv->pinmode_mask, priv->pinmode_val); ++ if (ret) ++ goto out_reset_assert; ++ ++ ave_global_reset(ndev); ++ ++ mdio_np = of_get_child_by_name(np, "mdio"); ++ if (!mdio_np) { ++ dev_err(dev, "mdio node not found\n"); ++ ret = -EINVAL; ++ goto out_reset_assert; ++ } ++ ret = of_mdiobus_register(priv->mdio, mdio_np); ++ of_node_put(mdio_np); ++ if (ret) { ++ dev_err(dev, "failed to register mdiobus\n"); ++ goto out_reset_assert; ++ } ++ ++ phydev = of_phy_get_and_connect(ndev, np, ave_phy_adjust_link); ++ if (!phydev) { ++ dev_err(dev, "could not attach to PHY\n"); ++ ret = -ENODEV; ++ goto out_mdio_unregister; ++ } ++ ++ priv->phydev = phydev; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ device_set_wakeup_capable(&ndev->dev, !!wol.supported); ++ ++ /* set wol initial state disabled */ ++ wol.wolopts = 0; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (!phy_interface_is_rgmii(phydev)) ++ phy_set_max_speed(phydev, SPEED_100); ++ ++ phy_support_asym_pause(phydev); ++ ++ phydev->mac_managed_pm = true; ++ ++ phy_attached_info(phydev); ++ ++ return 0; ++ ++out_mdio_unregister: ++ mdiobus_unregister(priv->mdio); ++out_reset_assert: ++ while (--nr >= 0) ++ reset_control_assert(priv->rst[nr]); ++out_clk_disable: ++ while (--nc >= 0) ++ clk_disable_unprepare(priv->clk[nc]); ++ ++ return ret; ++} ++ ++static void ave_uninit(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int i; ++ ++ phy_disconnect(priv->phydev); ++ mdiobus_unregister(priv->mdio); ++ ++ /* disable clk because of hw access after ndo_stop */ ++ for (i = 0; i < priv->nrsts; i++) ++ reset_control_assert(priv->rst[i]); ++ for (i = 0; i < priv->nclks; i++) ++ clk_disable_unprepare(priv->clk[i]); ++} ++ ++static int ave_open(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ int ret; ++ u32 val; ++ ++ ret = request_irq(priv->irq, ave_irq_handler, IRQF_SHARED, ndev->name, ++ ndev); ++ if (ret) ++ return ret; ++ ++ priv->tx.desc = kcalloc(priv->tx.ndesc, sizeof(*priv->tx.desc), ++ GFP_KERNEL); ++ if (!priv->tx.desc) { ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ priv->rx.desc = kcalloc(priv->rx.ndesc, 
sizeof(*priv->rx.desc), ++ GFP_KERNEL); ++ if (!priv->rx.desc) { ++ kfree(priv->tx.desc); ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ /* initialize Tx work and descriptor */ ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, entry, 0); ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, entry, 0); ++ } ++ writel(AVE_TXDC_ADDR_START | ++ (((priv->tx.ndesc * priv->desc_size) << 16) & AVE_TXDC_SIZE), ++ priv->base + AVE_TXDC); ++ ++ /* initialize Rx work and descriptor */ ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (ave_rxdesc_prepare(ndev, entry)) ++ break; ++ } ++ writel(AVE_RXDC0_ADDR_START | ++ (((priv->rx.ndesc * priv->desc_size) << 16) & AVE_RXDC0_SIZE), ++ priv->base + AVE_RXDC0); ++ ++ ave_desc_switch(ndev, AVE_DESC_START); ++ ++ ave_pfsel_init(ndev); ++ ave_macaddr_init(ndev); ++ ++ /* set Rx configuration */ ++ /* full duplex, enable pause drop, enalbe flow control */ ++ val = AVE_RXCR_RXEN | AVE_RXCR_FDUPEN | AVE_RXCR_DRPEN | ++ AVE_RXCR_FLOCTR | (AVE_MAX_ETHFRAME & AVE_RXCR_MPSIZ_MASK); ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set Tx configuration */ ++ /* enable flow control, disable loopback */ ++ writel(AVE_TXCR_FLOCTR, priv->base + AVE_TXCR); ++ ++ /* enable timer, clear EN,INTM, and mask interval unit(BSCK) */ ++ val = readl(priv->base + AVE_IIRQC) & AVE_IIRQC_BSCK; ++ val |= AVE_IIRQC_EN0 | (AVE_INTM_COUNT << 16); ++ writel(val, priv->base + AVE_IIRQC); ++ ++ val = AVE_GI_RXIINT | AVE_GI_RXOVF | AVE_GI_TX | AVE_GI_RXDROP; ++ ave_irq_restore(ndev, val); ++ ++ napi_enable(&priv->napi_rx); ++ napi_enable(&priv->napi_tx); ++ ++ phy_start(ndev->phydev); ++ phy_start_aneg(ndev->phydev); ++ netif_start_queue(ndev); ++ ++ return 0; ++ ++out_free_irq: ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ return ret; ++} ++ ++static int ave_stop(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ ++ ave_irq_disable_all(ndev); ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ netif_tx_disable(ndev); ++ phy_stop(ndev->phydev); ++ napi_disable(&priv->napi_tx); ++ napi_disable(&priv->napi_rx); ++ ++ ave_desc_switch(ndev, AVE_DESC_STOP); ++ ++ /* free Tx buffer */ ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ if (!priv->tx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->tx.desc[entry], DMA_TO_DEVICE); ++ dev_kfree_skb_any(priv->tx.desc[entry].skbs); ++ priv->tx.desc[entry].skbs = NULL; ++ } ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ ++ /* free Rx buffer */ ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (!priv->rx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[entry], DMA_FROM_DEVICE); ++ dev_kfree_skb_any(priv->rx.desc[entry].skbs); ++ priv->rx.desc[entry].skbs = NULL; ++ } ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ ++ kfree(priv->tx.desc); ++ kfree(priv->rx.desc); ++ ++ return 0; ++} ++ ++static netdev_tx_t ave_start_xmit(struct sk_buff *skb, struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ int ret, freepkt; ++ dma_addr_t paddr; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ freepkt = ((done_idx + ndesc - 1) - proc_idx) % ndesc; ++ ++ /* stop queue when not enough entry */ ++ if (unlikely(freepkt < 1)) { ++ netif_stop_queue(ndev); ++ return 
NETDEV_TX_BUSY; ++ } ++ ++ /* add padding for short packet */ ++ if (skb_put_padto(skb, ETH_ZLEN)) { ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ /* map Tx buffer ++ * Tx buffer set to the Tx descriptor doesn't have any restriction. ++ */ ++ ret = ave_dma_map(ndev, &priv->tx.desc[proc_idx], ++ skb->data, skb->len, DMA_TO_DEVICE, &paddr); ++ if (ret) { ++ dev_kfree_skb_any(skb); ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ priv->tx.desc[proc_idx].skbs = skb; ++ ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, proc_idx, paddr); ++ ++ cmdsts = AVE_STS_OWN | AVE_STS_1ST | AVE_STS_LAST | ++ (skb->len & AVE_STS_PKTLEN_TX_MASK); ++ ++ /* set interrupt per AVE_FORCE_TXINTCNT or when queue is stopped */ ++ if (!(proc_idx % AVE_FORCE_TXINTCNT) || netif_queue_stopped(ndev)) ++ cmdsts |= AVE_STS_INTR; ++ ++ /* disable checksum calculation when skb doesn't calurate checksum */ ++ if (skb->ip_summed == CHECKSUM_NONE || ++ skb->ip_summed == CHECKSUM_UNNECESSARY) ++ cmdsts |= AVE_STS_NOCSUM; ++ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, proc_idx, cmdsts); ++ ++ priv->tx.proc_idx = (proc_idx + 1) % ndesc; ++ ++ return NETDEV_TX_OK; ++} ++ ++static int ave_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) ++{ ++ return phy_mii_ioctl(ndev->phydev, ifr, cmd); ++} ++ ++static const u8 v4multi_macadr[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++static const u8 v6multi_macadr[] = { 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++ ++static void ave_set_rx_mode(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct netdev_hw_addr *hw_adr; ++ int count, mc_cnt; ++ u32 val; ++ ++ /* MAC addr filter enable for promiscious mode */ ++ mc_cnt = netdev_mc_count(ndev); ++ val = readl(priv->base + AVE_RXCR); ++ if (ndev->flags & IFF_PROMISC || !mc_cnt) ++ val &= ~AVE_RXCR_AFEN; ++ else ++ val |= AVE_RXCR_AFEN; ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set all multicast address */ ++ if ((ndev->flags & IFF_ALLMULTI) || mc_cnt > AVE_PF_MULTICAST_SIZE) { ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST, ++ v4multi_macadr, 1); ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + 1, ++ v6multi_macadr, 1); ++ } else { ++ /* stop all multicast filter */ ++ for (count = 0; count < AVE_PF_MULTICAST_SIZE; count++) ++ ave_pfsel_stop(ndev, AVE_PFNUM_MULTICAST + count); ++ ++ /* set multicast addresses */ ++ count = 0; ++ netdev_for_each_mc_addr(hw_adr, ndev) { ++ if (count == mc_cnt) ++ break; ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + count, ++ hw_adr->addr, 6); ++ count++; ++ } ++ } ++} ++ ++static void ave_get_stats64(struct net_device *ndev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_rx.syncp); ++ stats->rx_packets = priv->stats_rx.packets; ++ stats->rx_bytes = priv->stats_rx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_rx.syncp, start)); ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_tx.syncp); ++ stats->tx_packets = priv->stats_tx.packets; ++ stats->tx_bytes = priv->stats_tx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_tx.syncp, start)); ++ ++ stats->rx_errors = priv->stats_rx.errors; ++ stats->tx_errors = priv->stats_tx.errors; ++ stats->rx_dropped = priv->stats_rx.dropped; ++ stats->tx_dropped = priv->stats_tx.dropped; ++ stats->rx_fifo_errors = priv->stats_rx.fifo_errors; ++ stats->collisions = priv->stats_tx.collisions; ++} ++ ++static int ave_set_mac_address(struct 
net_device *ndev, void *p) ++{ ++ int ret = eth_mac_addr(ndev, p); ++ ++ if (ret) ++ return ret; ++ ++ ave_macaddr_init(ndev); ++ ++ return 0; ++} ++ ++static const struct net_device_ops ave_netdev_ops = { ++ .ndo_init = ave_init, ++ .ndo_uninit = ave_uninit, ++ .ndo_open = ave_open, ++ .ndo_stop = ave_stop, ++ .ndo_start_xmit = ave_start_xmit, ++ .ndo_eth_ioctl = ave_ioctl, ++ .ndo_set_rx_mode = ave_set_rx_mode, ++ .ndo_get_stats64 = ave_get_stats64, ++ .ndo_set_mac_address = ave_set_mac_address, ++}; ++ ++static int ave_probe(struct platform_device *pdev) ++{ ++ const struct ave_soc_data *data; ++ struct device *dev = &pdev->dev; ++ char buf[ETHTOOL_FWVERS_LEN]; ++ struct of_phandle_args args; ++ phy_interface_t phy_mode; ++ struct ave_private *priv; ++ struct net_device *ndev; ++ struct device_node *np; ++ void __iomem *base; ++ const char *name; ++ int i, irq, ret; ++ u64 dma_mask; ++ u32 ave_id; ++ ++ data = of_device_get_match_data(dev); ++ if (WARN_ON(!data)) ++ return -EINVAL; ++ ++ np = dev->of_node; ++ ret = of_get_phy_mode(np, &phy_mode); ++ if (ret) { ++ dev_err(dev, "phy-mode not found\n"); ++ return ret; ++ } ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(base)) ++ return PTR_ERR(base); ++ ++ ndev = devm_alloc_etherdev(dev, sizeof(struct ave_private)); ++ if (!ndev) { ++ dev_err(dev, "can't allocate ethernet device\n"); ++ return -ENOMEM; ++ } ++ ++ ndev->netdev_ops = &ave_netdev_ops; ++ ndev->ethtool_ops = &ave_ethtool_ops; ++ SET_NETDEV_DEV(ndev, dev); ++ ++ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ndev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ++ ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN); ++ ++ ret = of_get_ethdev_address(np, ndev); ++ if (ret) { ++ /* if the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(ndev); ++ dev_warn(dev, "Using random MAC address: %pM\n", ++ ndev->dev_addr); ++ } ++ ++ priv = netdev_priv(ndev); ++ priv->base = base; ++ priv->irq = irq; ++ priv->ndev = ndev; ++ priv->msg_enable = netif_msg_init(-1, AVE_DEFAULT_MSG_ENABLE); ++ priv->phy_mode = phy_mode; ++ priv->data = data; ++ ++ if (IS_DESC_64BIT(priv)) { ++ priv->desc_size = AVE_DESC_SIZE_64; ++ priv->tx.daddr = AVE_TXDM_64; ++ priv->rx.daddr = AVE_RXDM_64; ++ dma_mask = DMA_BIT_MASK(64); ++ } else { ++ priv->desc_size = AVE_DESC_SIZE_32; ++ priv->tx.daddr = AVE_TXDM_32; ++ priv->rx.daddr = AVE_RXDM_32; ++ dma_mask = DMA_BIT_MASK(32); ++ } ++ ret = dma_set_mask(dev, dma_mask); ++ if (ret) ++ return ret; ++ ++ priv->tx.ndesc = AVE_NR_TXDESC; ++ priv->rx.ndesc = AVE_NR_RXDESC; ++ ++ u64_stats_init(&priv->stats_tx.syncp); ++ u64_stats_init(&priv->stats_rx.syncp); ++ ++ for (i = 0; i < AVE_MAX_CLKS; i++) { ++ name = priv->data->clock_names[i]; ++ if (!name) ++ break; ++ priv->clk[i] = devm_clk_get(dev, name); ++ if (IS_ERR(priv->clk[i])) ++ return PTR_ERR(priv->clk[i]); ++ priv->nclks++; ++ } ++ ++ for (i = 0; i < AVE_MAX_RSTS; i++) { ++ name = priv->data->reset_names[i]; ++ if (!name) ++ break; ++ priv->rst[i] = devm_reset_control_get_shared(dev, name); ++ if (IS_ERR(priv->rst[i])) ++ return PTR_ERR(priv->rst[i]); ++ priv->nrsts++; ++ } ++ ++ ret = of_parse_phandle_with_fixed_args(np, ++ "socionext,syscon-phy-mode", ++ 1, 0, &args); ++ if (ret) { ++ dev_err(dev, "can't get syscon-phy-mode property\n"); ++ return ret; ++ } ++ priv->regmap = syscon_node_to_regmap(args.np); ++ of_node_put(args.np); ++ if (IS_ERR(priv->regmap)) { ++ 
dev_err(dev, "can't map syscon-phy-mode\n"); ++ return PTR_ERR(priv->regmap); ++ } ++ ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); ++ if (ret) { ++ dev_err(dev, "invalid phy-mode setting\n"); ++ return ret; ++ } ++ ++ priv->mdio = devm_mdiobus_alloc(dev); ++ if (!priv->mdio) ++ return -ENOMEM; ++ priv->mdio->priv = ndev; ++ priv->mdio->parent = dev; ++ priv->mdio->read = ave_mdiobus_read; ++ priv->mdio->write = ave_mdiobus_write; ++ priv->mdio->name = "uniphier-mdio"; ++ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "%s-%x", ++ pdev->name, pdev->id); ++ ++ /* Register as a NAPI supported driver */ ++ netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add_tx(ndev, &priv->napi_tx, ave_napi_poll_tx); ++ ++ platform_set_drvdata(pdev, ndev); ++ ++ ret = register_netdev(ndev); ++ if (ret) { ++ dev_err(dev, "failed to register netdevice\n"); ++ goto out_del_napi; ++ } ++ ++ /* get ID and version */ ++ ave_id = readl(priv->base + AVE_IDR); ++ ave_hw_read_version(ndev, buf, sizeof(buf)); ++ ++ dev_info(dev, "Socionext %c%c%c%c Ethernet IP %s (irq=%d, phy=%s)\n", ++ (ave_id >> 24) & 0xff, (ave_id >> 16) & 0xff, ++ (ave_id >> 8) & 0xff, (ave_id >> 0) & 0xff, ++ buf, priv->irq, phy_modes(phy_mode)); ++ ++ return 0; ++ ++out_del_napi: ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return ret; ++} ++ ++static int ave_remove(struct platform_device *pdev) ++{ ++ struct net_device *ndev = platform_get_drvdata(pdev); ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ unregister_netdev(ndev); ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int ave_suspend(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ if (netif_running(ndev)) { ++ ret = ave_stop(ndev); ++ netif_device_detach(ndev); ++ } ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ priv->wolopts = wol.wolopts; ++ ++ return ret; ++} ++ ++static int ave_resume(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ ave_global_reset(ndev); ++ ++ ret = phy_init_hw(ndev->phydev); ++ if (ret) ++ return ret; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ wol.wolopts = priv->wolopts; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (ndev->phydev) { ++ ret = phy_resume(ndev->phydev); ++ if (ret) ++ return ret; ++ } ++ ++ if (netif_running(ndev)) { ++ ret = ave_open(ndev); ++ netif_device_attach(ndev); ++ } ++ ++ return ret; ++} ++ ++static SIMPLE_DEV_PM_OPS(ave_pm_ops, ave_suspend, ave_resume); ++#define AVE_PM_OPS (&ave_pm_ops) ++#else ++#define AVE_PM_OPS NULL ++#endif ++ ++static int ave_pro4_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld11_get_pinmode(struct ave_private *priv, ++ phy_interface_t 
phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_INTERNAL: ++ priv->pinmode_val = 0; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld20_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_pxs3_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 1) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(arg); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(arg); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct ave_soc_data ave_pro4_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "gio", "ether", "ether-gb", "ether-phy", ++ }, ++ .reset_names = { ++ "gio", "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs2_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld11_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld11_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld20_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld20_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs3_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_nx1_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct of_device_id of_ave_match[] = { ++ { ++ .compatible = "socionext,uniphier-pro4-ave4", ++ .data = &ave_pro4_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs2-ave4", ++ .data = &ave_pxs2_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld11-ave4", ++ .data = &ave_ld11_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld20-ave4", ++ .data = &ave_ld20_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs3-ave4", ++ .data = &ave_pxs3_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-nx1-ave4", ++ .data = &ave_nx1_data, ++ }, ++ { /* Sentinel */ } ++}; ++MODULE_DEVICE_TABLE(of, of_ave_match); ++ ++static struct platform_driver ave_driver = { ++ .probe = ave_probe, ++ .remove = ave_remove, ++ .driver = { ++ .name = "ave", ++ .pm = AVE_PM_OPS, ++ 
.of_match_table = of_ave_match, ++ }, ++}; ++module_platform_driver(ave_driver); ++ ++MODULE_AUTHOR("Kunihiko Hayashi "); ++MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c +--- linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats cpu_stats = per_cpu_ptr(ndev_priv->stats, cpu); do { @@ -2501,11 +14651,10 @@ index f4a6b590a1e39..1b62400c19049 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c -index b15d44261e766..68c7b2c05aab3 100644 ---- a/drivers/net/ethernet/ti/netcp_core.c -+++ b/drivers/net/ethernet/ti/netcp_core.c -@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/ti/netcp_core.c linux/drivers/net/ethernet/ti/netcp_core.c +--- linux.orig/drivers/net/ethernet/ti/netcp_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/netcp_core.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, unsigned int start; do { @@ -2526,11 +14675,10 @@ index b15d44261e766..68c7b2c05aab3 100644 stats->rx_packets = rxpackets; stats->rx_bytes = rxbytes; -diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c -index 509c5e9b29dfa..5301c907b5ae3 100644 ---- a/drivers/net/ethernet/via/via-rhine.c -+++ b/drivers/net/ethernet/via/via-rhine.c -@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/via/via-rhine.c linux/drivers/net/ethernet/via/via-rhine.c +--- linux.orig/drivers/net/ethernet/via/via-rhine.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/via/via-rhine.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2551,11 +14699,10 @@ index 509c5e9b29dfa..5301c907b5ae3 100644 } static void rhine_set_rx_mode(struct net_device *dev) -diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -index 9262988d26a32..2c233b59e7d93 100644 ---- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +--- linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2576,11 +14723,10 @@ index 9262988d26a32..2c233b59e7d93 100644 } static const struct net_device_ops axienet_netdev_ops = { -diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c -index 8113ac17ab70a..2fd8b9c51e839 100644 ---- a/drivers/net/hyperv/netvsc_drv.c -+++ b/drivers/net/hyperv/netvsc_drv.c -@@ 
-1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct net_device *net, +diff -rupN linux.orig/drivers/net/hyperv/netvsc_drv.c linux/drivers/net/hyperv/netvsc_drv.c +--- linux.orig/drivers/net/hyperv/netvsc_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/hyperv/netvsc_drv.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct n unsigned int start; do { @@ -2595,7 +14741,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct unsigned int start; do { @@ -2610,7 +14756,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_packets = this_tot->vf_rx_packets; this_tot->tx_packets = this_tot->vf_tx_packets; this_tot->rx_bytes = this_tot->vf_rx_bytes; -@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct tx_stats = &nvchan->tx_stats; do { @@ -2635,7 +14781,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_bytes += bytes; this_tot->rx_packets += packets; -@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct net_device *net, +@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct ne tx_stats = &nvchan->tx_stats; do { @@ -2661,7 +14807,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 t->rx_bytes += bytes; t->rx_packets += packets; -@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, +@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(str tx_stats = &nvdev->chan_table[j].tx_stats; do { @@ -2690,11 +14836,10 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 data[i++] = packets; data[i++] = bytes; data[i++] = xdp_drop; -diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c -index 1c64d5347b8e0..78253ad57b2ef 100644 ---- a/drivers/net/ifb.c -+++ b/drivers/net/ifb.c -@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ifb.c linux/drivers/net/ifb.c +--- linux.orig/drivers/net/ifb.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ifb.c 2022-12-04 10:40:26.696034096 -0500 +@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_devic for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { @@ -2717,7 +14862,7 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **data, +@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **da int j; do { @@ -2732,11 +14877,10 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 *data += IFB_Q_STATS_LEN; } -diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c -index 49ba8a50dfb1e..8a58d74638cd8 100644 ---- a/drivers/net/ipvlan/ipvlan_main.c -+++ b/drivers/net/ipvlan/ipvlan_main.c -@@ -299,13 +299,13 @@ static void ipvlan_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c linux/drivers/net/ipvlan/ipvlan_main.c +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -301,13 +301,13 @@ static void ipvlan_get_stats64(struct ne for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { @@ -2752,11 +14896,1096 @@ index 49ba8a50dfb1e..8a58d74638cd8 100644 strt)); s->rx_packets += rx_pkts; -diff 
--git a/drivers/net/loopback.c b/drivers/net/loopback.c -index 14e8d04cb4347..c4ad98d39ea60 100644 ---- a/drivers/net/loopback.c -+++ b/drivers/net/loopback.c -@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig linux/drivers/net/ipvlan/ipvlan_main.c.orig +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,1082 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* Copyright (c) 2014 Mahesh Bandewar ++ */ ++ ++#include ++ ++#include "ipvlan.h" ++ ++static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan; ++ unsigned int flags; ++ int err; ++ ++ ASSERT_RTNL(); ++ if (port->mode != nval) { ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { ++ err = dev_change_flags(ipvlan->dev, ++ flags | IFF_NOARP, ++ extack); ++ } else { ++ err = dev_change_flags(ipvlan->dev, ++ flags & ~IFF_NOARP, ++ extack); ++ } ++ if (unlikely(err)) ++ goto fail; ++ } ++ if (nval == IPVLAN_MODE_L3S) { ++ /* New mode is L3S */ ++ err = ipvlan_l3s_register(port); ++ if (err) ++ goto fail; ++ } else if (port->mode == IPVLAN_MODE_L3S) { ++ /* Old mode was L3S */ ++ ipvlan_l3s_unregister(port); ++ } ++ port->mode = nval; ++ } ++ return 0; ++ ++fail: ++ /* Undo the flags changes that have been done so far. */ ++ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (port->mode == IPVLAN_MODE_L3 || ++ port->mode == IPVLAN_MODE_L3S) ++ dev_change_flags(ipvlan->dev, flags | IFF_NOARP, ++ NULL); ++ else ++ dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, ++ NULL); ++ } ++ ++ return err; ++} ++ ++static int ipvlan_port_create(struct net_device *dev) ++{ ++ struct ipvl_port *port; ++ int err, idx; ++ ++ port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); ++ if (!port) ++ return -ENOMEM; ++ ++ write_pnet(&port->pnet, dev_net(dev)); ++ port->dev = dev; ++ port->mode = IPVLAN_MODE_L3; ++ INIT_LIST_HEAD(&port->ipvlans); ++ for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) ++ INIT_HLIST_HEAD(&port->hlhead[idx]); ++ ++ skb_queue_head_init(&port->backlog); ++ INIT_WORK(&port->wq, ipvlan_process_multicast); ++ ida_init(&port->ida); ++ port->dev_id_start = 1; ++ ++ err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); ++ if (err) ++ goto err; ++ ++ netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); ++ return 0; ++ ++err: ++ kfree(port); ++ return err; ++} ++ ++static void ipvlan_port_destroy(struct net_device *dev) ++{ ++ struct ipvl_port *port = ipvlan_port_get_rtnl(dev); ++ struct sk_buff *skb; ++ ++ netdev_put(dev, &port->dev_tracker); ++ if (port->mode == IPVLAN_MODE_L3S) ++ ipvlan_l3s_unregister(port); ++ netdev_rx_handler_unregister(dev); ++ cancel_work_sync(&port->wq); ++ while ((skb = __skb_dequeue(&port->backlog)) != NULL) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ } ++ ida_destroy(&port->ida); ++ kfree(port); ++} ++ ++#define IPVLAN_ALWAYS_ON_OFLOADS \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | \ ++ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) ++ ++#define IPVLAN_ALWAYS_ON \ ++ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) ++ ++#define IPVLAN_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ ++ 
NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ ++ NETIF_F_GRO | NETIF_F_RXCSUM | \ ++ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) ++ ++ /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ ++ ++#define IPVLAN_STATE_MASK \ ++ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) ++ ++static int ipvlan_init(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ int err; ++ ++ dev->state = (dev->state & ~IPVLAN_STATE_MASK) | ++ (phy_dev->state & IPVLAN_STATE_MASK); ++ dev->features = phy_dev->features & IPVLAN_FEATURES; ++ dev->features |= IPVLAN_ALWAYS_ON; ++ dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; ++ dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; ++ dev->hw_enc_features |= dev->features; ++ netif_inherit_tso_max(dev, phy_dev); ++ dev->hard_header_len = phy_dev->hard_header_len; ++ ++ netdev_lockdep_set_classes(dev); ++ ++ ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); ++ if (!ipvlan->pcpu_stats) ++ return -ENOMEM; ++ ++ if (!netif_is_ipvlan_port(phy_dev)) { ++ err = ipvlan_port_create(phy_dev); ++ if (err < 0) { ++ free_percpu(ipvlan->pcpu_stats); ++ return err; ++ } ++ } ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count += 1; ++ return 0; ++} ++ ++static void ipvlan_uninit(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ ++ free_percpu(ipvlan->pcpu_stats); ++ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count -= 1; ++ if (!port->count) ++ ipvlan_port_destroy(port->dev); ++} ++ ++static int ipvlan_open(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr; ++ ++ if (ipvlan->port->mode == IPVLAN_MODE_L3 || ++ ipvlan->port->mode == IPVLAN_MODE_L3S) ++ dev->flags |= IFF_NOARP; ++ else ++ dev->flags &= ~IFF_NOARP; ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static int ipvlan_stop(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_addr *addr; ++ ++ dev_uc_unsync(phy_dev, dev); ++ dev_mc_unsync(phy_dev, dev); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_del(addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ int skblen = skb->len; ++ int ret; ++ ++ ret = ipvlan_queue_xmit(skb, dev); ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct ipvl_pcpu_stats *pcptr; ++ ++ pcptr = this_cpu_ptr(ipvlan->pcpu_stats); ++ ++ u64_stats_update_begin(&pcptr->syncp); ++ u64_stats_inc(&pcptr->tx_pkts); ++ u64_stats_add(&pcptr->tx_bytes, skblen); ++ u64_stats_update_end(&pcptr->syncp); ++ } else { ++ this_cpu_inc(ipvlan->pcpu_stats->tx_drps); ++ } ++ return ret; ++} ++ ++static netdev_features_t ipvlan_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ features |= NETIF_F_ALL_FOR_ALL; ++ features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); ++ features = netdev_increment_features(ipvlan->phy_dev->features, ++ features, features); ++ features |= IPVLAN_ALWAYS_ON; ++ features &= 
(IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); ++ ++ return features; ++} ++ ++static void ipvlan_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); ++} ++ ++static void ipvlan_set_multicast_mac_filter(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { ++ bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); ++ } else { ++ struct netdev_hw_addr *ha; ++ DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ ++ bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ netdev_for_each_mc_addr(ha, dev) ++ __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); ++ ++ /* Turn-on broadcast bit irrespective of address family, ++ * since broadcast is deferred to a work-queue, hence no ++ * impact on fast-path processing. ++ */ ++ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); ++ ++ bitmap_copy(ipvlan->mac_filters, mc_filters, ++ IPVLAN_MAC_FILTER_SIZE); ++ } ++ dev_uc_sync(ipvlan->phy_dev, dev); ++ dev_mc_sync(ipvlan->phy_dev, dev); ++} ++ ++static void ipvlan_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (ipvlan->pcpu_stats) { ++ struct ipvl_pcpu_stats *pcptr; ++ u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; ++ u32 rx_errs = 0, tx_drps = 0; ++ u32 strt; ++ int idx; ++ ++ for_each_possible_cpu(idx) { ++ pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); ++ do { ++ strt= u64_stats_fetch_begin_irq(&pcptr->syncp); ++ rx_pkts = u64_stats_read(&pcptr->rx_pkts); ++ rx_bytes = u64_stats_read(&pcptr->rx_bytes); ++ rx_mcast = u64_stats_read(&pcptr->rx_mcast); ++ tx_pkts = u64_stats_read(&pcptr->tx_pkts); ++ tx_bytes = u64_stats_read(&pcptr->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&pcptr->syncp, ++ strt)); ++ ++ s->rx_packets += rx_pkts; ++ s->rx_bytes += rx_bytes; ++ s->multicast += rx_mcast; ++ s->tx_packets += tx_pkts; ++ s->tx_bytes += tx_bytes; ++ ++ /* u32 values are updated without syncp protection. 
*/ ++ rx_errs += READ_ONCE(pcptr->rx_errs); ++ tx_drps += READ_ONCE(pcptr->tx_drps); ++ } ++ s->rx_errors = rx_errs; ++ s->rx_dropped = rx_errs; ++ s->tx_dropped = tx_drps; ++ } ++} ++ ++static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ return vlan_vid_add(phy_dev, proto, vid); ++} ++ ++static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, ++ u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ vlan_vid_del(phy_dev, proto, vid); ++ return 0; ++} ++ ++static int ipvlan_get_iflink(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->phy_dev->ifindex; ++} ++ ++static const struct net_device_ops ipvlan_netdev_ops = { ++ .ndo_init = ipvlan_init, ++ .ndo_uninit = ipvlan_uninit, ++ .ndo_open = ipvlan_open, ++ .ndo_stop = ipvlan_stop, ++ .ndo_start_xmit = ipvlan_start_xmit, ++ .ndo_fix_features = ipvlan_fix_features, ++ .ndo_change_rx_flags = ipvlan_change_rx_flags, ++ .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, ++ .ndo_get_stats64 = ipvlan_get_stats64, ++ .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, ++ .ndo_get_iflink = ipvlan_get_iflink, ++}; ++ ++static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, ++ unsigned short type, const void *daddr, ++ const void *saddr, unsigned len) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ /* TODO Probably use a different field than dev_addr so that the ++ * mac-address on the virtual device is portable and can be carried ++ * while the packets use the mac-addr on the physical device. ++ */ ++ return dev_hard_header(skb, phy_dev, type, daddr, ++ saddr ? 
: phy_dev->dev_addr, len); ++} ++ ++static const struct header_ops ipvlan_header_ops = { ++ .create = ipvlan_hard_header, ++ .parse = eth_header_parse, ++ .cache = eth_header_cache, ++ .cache_update = eth_header_cache_update, ++}; ++ ++static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) ++{ ++ ipvlan->dev->mtu = dev->mtu; ++} ++ ++static bool netif_is_ipvlan(const struct net_device *dev) ++{ ++ /* both ipvlan and ipvtap devices use the same netdev_ops */ ++ return dev->netdev_ops == &ipvlan_netdev_ops; ++} ++ ++static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); ++} ++ ++static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *drvinfo) ++{ ++ strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); ++ strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); ++} ++ ++static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->msg_enable; ++} ++ ++static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ ipvlan->msg_enable = value; ++} ++ ++static const struct ethtool_ops ipvlan_ethtool_ops = { ++ .get_link = ethtool_op_get_link, ++ .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, ++ .get_drvinfo = ipvlan_ethtool_get_drvinfo, ++ .get_msglevel = ipvlan_ethtool_get_msglevel, ++ .set_msglevel = ipvlan_ethtool_set_msglevel, ++}; ++ ++static int ipvlan_nl_changelink(struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int err = 0; ++ ++ if (!data) ++ return 0; ++ if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, nmode, extack); ++ } ++ ++ if (!err && data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (flags & IPVLAN_F_PRIVATE) ++ ipvlan_mark_private(port); ++ else ++ ipvlan_clear_private(port); ++ ++ if (flags & IPVLAN_F_VEPA) ++ ipvlan_mark_vepa(port); ++ else ++ ipvlan_clear_vepa(port); ++ } ++ ++ return err; ++} ++ ++static size_t ipvlan_nl_getsize(const struct net_device *dev) ++{ ++ return (0 ++ + nla_total_size(2) /* IFLA_IPVLAN_MODE */ ++ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ++ ); ++} ++ ++static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ if (mode >= IPVLAN_MODE_MAX) ++ return -EINVAL; ++ } ++ if (data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ /* Only two bits are used at this moment. */ ++ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ /* Also both flags can't be active at the same time. 
*/ ++ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == ++ (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ipvlan_nl_fillinfo(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int ret = -EINVAL; ++ ++ if (!port) ++ goto err; ++ ++ ret = -EMSGSIZE; ++ if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) ++ goto err; ++ if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) ++ goto err; ++ ++ return 0; ++ ++err: ++ return ret; ++} ++ ++int ipvlan_link_new(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port; ++ struct net_device *phy_dev; ++ int err; ++ u16 mode = IPVLAN_MODE_L3; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ ++ phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); ++ if (!phy_dev) ++ return -ENODEV; ++ ++ if (netif_is_ipvlan(phy_dev)) { ++ struct ipvl_dev *tmp = netdev_priv(phy_dev); ++ ++ phy_dev = tmp->phy_dev; ++ if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ } else if (!netif_is_ipvlan_port(phy_dev)) { ++ /* Exit early if the underlying link is invalid or busy */ ++ if (phy_dev->type != ARPHRD_ETHER || ++ phy_dev->flags & IFF_LOOPBACK) { ++ netdev_err(phy_dev, ++ "Master is either lo or non-ether device\n"); ++ return -EINVAL; ++ } ++ ++ if (netdev_is_rx_handler_busy(phy_dev)) { ++ netdev_err(phy_dev, "Device is already in use.\n"); ++ return -EBUSY; ++ } ++ } ++ ++ ipvlan->phy_dev = phy_dev; ++ ipvlan->dev = dev; ++ ipvlan->sfeatures = IPVLAN_FEATURES; ++ if (!tb[IFLA_MTU]) ++ ipvlan_adjust_mtu(ipvlan, phy_dev); ++ INIT_LIST_HEAD(&ipvlan->addrs); ++ spin_lock_init(&ipvlan->addrs_lock); ++ ++ /* TODO Probably put random address here to be presented to the ++ * world but keep using the physical-dev address for the outgoing ++ * packets. ++ */ ++ eth_hw_addr_set(dev, phy_dev->dev_addr); ++ ++ dev->priv_flags |= IFF_NO_RX_HANDLER; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ /* ipvlan_init() would have created the port, if required */ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ ipvlan->port = port; ++ ++ /* If the port-id base is at the MAX value, then wrap it around and ++ * begin from 0x1 again. This may be due to a busy system where lots ++ * of slaves are getting created and deleted. ++ */ ++ if (port->dev_id_start == 0xFFFE) ++ port->dev_id_start = 0x1; ++ ++ /* Since L2 address is shared among all IPvlan slaves including ++ * master, use unique 16 bit dev-ids to diffentiate among them. ++ * Assign IDs between 0x1 and 0xFFFE (used by the master) to each ++ * slave link [see addrconf_ifid_eui48()]. ++ */ ++ err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, ++ GFP_KERNEL); ++ if (err < 0) ++ err = ida_simple_get(&port->ida, 0x1, port->dev_id_start, ++ GFP_KERNEL); ++ if (err < 0) ++ goto unregister_netdev; ++ dev->dev_id = err; ++ ++ /* Increment id-base to the next slot for the future assignment */ ++ port->dev_id_start = err + 1; ++ ++ err = netdev_upper_dev_link(phy_dev, dev, extack); ++ if (err) ++ goto remove_ida; ++ ++ /* Flags are per port and latest update overrides. User has ++ * to be consistent in setting it just like the mode attribute. 
++ */ ++ if (data && data[IFLA_IPVLAN_FLAGS]) ++ port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (data && data[IFLA_IPVLAN_MODE]) ++ mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, mode, extack); ++ if (err) ++ goto unlink_netdev; ++ ++ list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); ++ netif_stacked_transfer_operstate(phy_dev, dev); ++ return 0; ++ ++unlink_netdev: ++ netdev_upper_dev_unlink(phy_dev, dev); ++remove_ida: ++ ida_simple_remove(&port->ida, dev->dev_id); ++unregister_netdev: ++ unregister_netdevice(dev); ++ return err; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_new); ++ ++void ipvlan_link_delete(struct net_device *dev, struct list_head *head) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr, *next; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ kfree_rcu(addr, rcu); ++ } ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ ++ ida_simple_remove(&ipvlan->port->ida, dev->dev_id); ++ list_del_rcu(&ipvlan->pnode); ++ unregister_netdevice_queue(dev, head); ++ netdev_upper_dev_unlink(ipvlan->phy_dev, dev); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_delete); ++ ++void ipvlan_link_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); ++ dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; ++ dev->netdev_ops = &ipvlan_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->header_ops = &ipvlan_header_ops; ++ dev->ethtool_ops = &ipvlan_ethtool_ops; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_setup); ++ ++static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = ++{ ++ [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, ++ [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, ++}; ++ ++static struct net *ipvlan_get_link_net(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return dev_net(ipvlan->phy_dev); ++} ++ ++static struct rtnl_link_ops ipvlan_link_ops = { ++ .kind = "ipvlan", ++ .priv_size = sizeof(struct ipvl_dev), ++ ++ .setup = ipvlan_link_setup, ++ .newlink = ipvlan_link_new, ++ .dellink = ipvlan_link_delete, ++ .get_link_net = ipvlan_get_link_net, ++}; ++ ++int ipvlan_link_register(struct rtnl_link_ops *ops) ++{ ++ ops->get_size = ipvlan_nl_getsize; ++ ops->policy = ipvlan_nl_policy; ++ ops->validate = ipvlan_nl_validate; ++ ops->fill_info = ipvlan_nl_fillinfo; ++ ops->changelink = ipvlan_nl_changelink; ++ ops->maxtype = IFLA_IPVLAN_MAX; ++ return rtnl_link_register(ops); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_register); ++ ++static int ipvlan_device_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); ++ struct netdev_notifier_pre_changeaddr_info *prechaddr_info; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct ipvl_dev *ipvlan, *next; ++ struct ipvl_port *port; ++ LIST_HEAD(lst_kill); ++ int err; ++ ++ if (!netif_is_ipvlan_port(dev)) ++ return NOTIFY_DONE; ++ ++ port = ipvlan_port_get_rtnl(dev); ++ ++ switch (event) { ++ case NETDEV_UP: ++ case NETDEV_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ netif_stacked_transfer_operstate(ipvlan->phy_dev, ++ ipvlan->dev); ++ break; ++ ++ case NETDEV_REGISTER: { ++ struct net *oldnet, *newnet = dev_net(dev); ++ ++ oldnet = read_pnet(&port->pnet); ++ if (net_eq(newnet, oldnet)) ++ break; ++ ++ 
write_pnet(&port->pnet, newnet); ++ ++ ipvlan_migrate_l3s_hook(oldnet, newnet); ++ break; ++ } ++ case NETDEV_UNREGISTER: ++ if (dev->reg_state != NETREG_UNREGISTERING) ++ break; ++ ++ list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ++ ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, ++ &lst_kill); ++ unregister_netdevice_many(&lst_kill); ++ break; ++ ++ case NETDEV_FEAT_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ netif_inherit_tso_max(ipvlan->dev, dev); ++ netdev_update_features(ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_CHANGEMTU: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ ipvlan_adjust_mtu(ipvlan, dev); ++ break; ++ ++ case NETDEV_PRE_CHANGEADDR: ++ prechaddr_info = ptr; ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ err = dev_pre_changeaddr_notify(ipvlan->dev, ++ prechaddr_info->dev_addr, ++ extack); ++ if (err) ++ return notifier_from_errno(err); ++ } ++ break; ++ ++ case NETDEV_CHANGEADDR: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ eth_hw_addr_set(ipvlan->dev, dev->dev_addr); ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_PRE_TYPE_CHANGE: ++ /* Forbid underlying device to change its type. */ ++ return NOTIFY_BAD; ++ } ++ return NOTIFY_DONE; ++} ++ ++/* the caller must held the addrs lock */ ++static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); ++ if (!addr) ++ return -ENOMEM; ++ ++ addr->master = ipvlan; ++ if (!is_v6) { ++ memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); ++ addr->atype = IPVL_IPV4; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); ++ addr->atype = IPVL_IPV6; ++#endif ++ } ++ ++ list_add_tail_rcu(&addr->anode, &ipvlan->addrs); ++ ++ /* If the interface is not up, the address will be added to the hash ++ * list by ipvlan_open. 
++ */ ++ if (netif_running(ipvlan->dev)) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ ++ return 0; ++} ++ ++static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); ++ if (!addr) { ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return; ++ } ++ ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ kfree_rcu(addr, rcu); ++} ++ ++static bool ipvlan_is_valid_dev(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!netif_is_ipvlan(dev)) ++ return false; ++ ++ if (!ipvlan || !ipvlan->port) ++ return false; ++ ++ return true; ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv6=%pI6c addr for %s intf\n", ++ ip6_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip6_addr, true); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip6_addr, true); ++} ++ ++static int ipvlan_addr6_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if6->idev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_add_addr6(ipvlan, &if6->addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ipvlan_del_addr6(ipvlan, &if6->addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr6_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { ++ NL_SET_ERR_MSG(i6vi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++#endif ++ ++static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv4=%pI4 on %s intf.\n", ++ ip4_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip4_addr, false); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip4_addr, false); ++} ++ ++static int ipvlan_addr4_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct in_addr ip4_addr; ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch 
(event) { ++ case NETDEV_UP: ++ ip4_addr.s_addr = if4->ifa_address; ++ if (ipvlan_add_addr4(ipvlan, &ip4_addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ip4_addr.s_addr = if4->ifa_address; ++ ipvlan_del_addr4(ipvlan, &ip4_addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr4_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_validator_info *ivi = (struct in_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { ++ NL_SET_ERR_MSG(ivi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_event, ++}; ++ ++static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_validator_event, ++}; ++ ++static struct notifier_block ipvlan_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_device_event, ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_event, ++}; ++ ++static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_validator_event, ++}; ++#endif ++ ++static int __init ipvlan_init_module(void) ++{ ++ int err; ++ ++ ipvlan_init_secret(); ++ register_netdevice_notifier(&ipvlan_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ register_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ register_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ register_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); ++ ++ err = ipvlan_l3s_init(); ++ if (err < 0) ++ goto error; ++ ++ err = ipvlan_link_register(&ipvlan_link_ops); ++ if (err < 0) { ++ ipvlan_l3s_cleanup(); ++ goto error; ++ } ++ ++ return 0; ++error: ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ return err; ++} ++ ++static void __exit ipvlan_cleanup_module(void) ++{ ++ rtnl_link_unregister(&ipvlan_link_ops); ++ ipvlan_l3s_cleanup(); ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++} ++ ++module_init(ipvlan_init_module); ++module_exit(ipvlan_cleanup_module); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Mahesh Bandewar "); ++MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); ++MODULE_ALIAS_RTNL_LINK("ipvlan"); +diff -rupN linux.orig/drivers/net/loopback.c linux/drivers/net/loopback.c +--- 
linux.orig/drivers/net/loopback.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/loopback.c 2022-12-04 10:40:26.696034096 -0500 +@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device * lb_stats = per_cpu_ptr(dev->lstats, i); do { @@ -2769,11 +15998,10 @@ index 14e8d04cb4347..c4ad98d39ea60 100644 *bytes += tbytes; *packets += tpackets; } -diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c -index c6d271e5687e9..5056f3cd5699a 100644 ---- a/drivers/net/macsec.c -+++ b/drivers/net/macsec.c -@@ -2823,9 +2823,9 @@ static void get_rx_sc_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c linux/drivers/net/macsec.c +--- linux.orig/drivers/net/macsec.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macsec.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2821,9 +2821,9 @@ static void get_rx_sc_stats(struct net_d stats = per_cpu_ptr(rx_sc->stats, cpu); do { @@ -2785,7 +16013,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; -@@ -2904,9 +2904,9 @@ static void get_tx_sc_stats(struct net_device *dev, +@@ -2902,9 +2902,9 @@ static void get_tx_sc_stats(struct net_d stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { @@ -2797,7 +16025,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; -@@ -2960,9 +2960,9 @@ static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) +@@ -2958,9 +2958,9 @@ static void get_secy_stats(struct net_de stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { @@ -2809,11 +16037,4431 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; -diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c -index 1080d6ebff63b..a1c7823f0ba66 100644 ---- a/drivers/net/macvlan.c -+++ b/drivers/net/macvlan.c -@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c.orig linux/drivers/net/macsec.c.orig +--- linux.orig/drivers/net/macsec.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/macsec.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,4417 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * drivers/net/macsec.c - MACsec device ++ * ++ * Copyright (c) 2015 Sabrina Dubroca ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MACSEC_SCI_LEN 8 ++ ++/* SecTAG length = macsec_eth_header without the optional SCI */ ++#define MACSEC_TAG_LEN 6 ++ ++struct macsec_eth_header { ++ struct ethhdr eth; ++ /* SecTAG */ ++ u8 tci_an; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ u8 short_length:6, ++ unused:2; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ u8 unused:2, ++ short_length:6; ++#else ++#error "Please fix " ++#endif ++ __be32 packet_number; ++ u8 secure_channel_id[8]; /* optional */ ++} __packed; ++ ++#define MACSEC_TCI_VERSION 0x80 ++#define MACSEC_TCI_ES 0x40 /* end station */ ++#define MACSEC_TCI_SC 0x20 /* SCI present */ ++#define MACSEC_TCI_SCB 0x10 /* epon */ ++#define MACSEC_TCI_E 0x08 /* encryption */ ++#define MACSEC_TCI_C 0x04 /* changed text */ ++#define MACSEC_AN_MASK 0x03 /* association number */ ++#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) ++ ++/* minimum secure data length deemed 
"not short", see IEEE 802.1AE-2006 9.7 */ ++#define MIN_NON_SHORT_LEN 48 ++ ++#define GCM_AES_IV_LEN 12 ++#define DEFAULT_ICV_LEN 16 ++ ++#define for_each_rxsc(secy, sc) \ ++ for (sc = rcu_dereference_bh(secy->rx_sc); \ ++ sc; \ ++ sc = rcu_dereference_bh(sc->next)) ++#define for_each_rxsc_rtnl(secy, sc) \ ++ for (sc = rtnl_dereference(secy->rx_sc); \ ++ sc; \ ++ sc = rtnl_dereference(sc->next)) ++ ++#define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) ++ ++struct gcm_iv_xpn { ++ union { ++ u8 short_secure_channel_id[4]; ++ ssci_t ssci; ++ }; ++ __be64 pn; ++} __packed; ++ ++struct gcm_iv { ++ union { ++ u8 secure_channel_id[8]; ++ sci_t sci; ++ }; ++ __be32 pn; ++}; ++ ++#define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT ++ ++struct pcpu_secy_stats { ++ struct macsec_dev_stats stats; ++ struct u64_stats_sync syncp; ++}; ++ ++/** ++ * struct macsec_dev - private data ++ * @secy: SecY config ++ * @real_dev: pointer to underlying netdevice ++ * @dev_tracker: refcount tracker for @real_dev reference ++ * @stats: MACsec device stats ++ * @secys: linked list of SecY's on the underlying device ++ * @gro_cells: pointer to the Generic Receive Offload cell ++ * @offload: status of offloading on the MACsec device ++ */ ++struct macsec_dev { ++ struct macsec_secy secy; ++ struct net_device *real_dev; ++ netdevice_tracker dev_tracker; ++ struct pcpu_secy_stats __percpu *stats; ++ struct list_head secys; ++ struct gro_cells gro_cells; ++ enum macsec_offload offload; ++}; ++ ++/** ++ * struct macsec_rxh_data - rx_handler private argument ++ * @secys: linked list of SecY's on this underlying device ++ */ ++struct macsec_rxh_data { ++ struct list_head secys; ++}; ++ ++static struct macsec_dev *macsec_priv(const struct net_device *dev) ++{ ++ return (struct macsec_dev *)netdev_priv(dev); ++} ++ ++static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) ++{ ++ return rcu_dereference_bh(dev->rx_handler_data); ++} ++ ++static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) ++{ ++ return rtnl_dereference(dev->rx_handler_data); ++} ++ ++struct macsec_cb { ++ struct aead_request *req; ++ union { ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_rx_sa *rx_sa; ++ }; ++ u8 assoc_num; ++ bool valid; ++ bool has_sci; ++}; ++ ++static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) ++{ ++ struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static struct macsec_rx_sa *macsec_active_rxsa_get(struct macsec_rx_sc *rx_sc) ++{ ++ struct macsec_rx_sa *sa = NULL; ++ int an; ++ ++ for (an = 0; an < MACSEC_NUM_AN; an++) { ++ sa = macsec_rxsa_get(rx_sc->sa[an]); ++ if (sa) ++ break; ++ } ++ return sa; ++} ++ ++static void free_rx_sc_rcu(struct rcu_head *head) ++{ ++ struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); ++ ++ free_percpu(rx_sc->stats); ++ kfree(rx_sc); ++} ++ ++static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) ++{ ++ return refcount_inc_not_zero(&sc->refcnt) ? 
sc : NULL; ++} ++ ++static void macsec_rxsc_put(struct macsec_rx_sc *sc) ++{ ++ if (refcount_dec_and_test(&sc->refcnt)) ++ call_rcu(&sc->rcu_head, free_rx_sc_rcu); ++} ++ ++static void free_rxsa(struct rcu_head *head) ++{ ++ struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_rxsa_put(struct macsec_rx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_rxsa); ++} ++ ++static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) ++{ ++ struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static void free_txsa(struct rcu_head *head) ++{ ++ struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_txsa_put(struct macsec_tx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_txsa); ++} ++ ++static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); ++ return (struct macsec_cb *)skb->cb; ++} ++ ++#define MACSEC_PORT_ES (htons(0x0001)) ++#define MACSEC_PORT_SCB (0x0000) ++#define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) ++#define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) ++ ++#define MACSEC_GCM_AES_128_SAK_LEN 16 ++#define MACSEC_GCM_AES_256_SAK_LEN 32 ++ ++#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN ++#define DEFAULT_XPN false ++#define DEFAULT_SEND_SCI true ++#define DEFAULT_ENCRYPT false ++#define DEFAULT_ENCODING_SA 0 ++#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) ++ ++static bool send_sci(const struct macsec_secy *secy) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ return tx_sc->send_sci || ++ (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); ++} ++ ++static sci_t make_sci(const u8 *addr, __be16 port) ++{ ++ sci_t sci; ++ ++ memcpy(&sci, addr, ETH_ALEN); ++ memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); ++ ++ return sci; ++} ++ ++static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) ++{ ++ sci_t sci; ++ ++ if (sci_present) ++ memcpy(&sci, hdr->secure_channel_id, ++ sizeof(hdr->secure_channel_id)); ++ else ++ sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); ++ ++ return sci; ++} ++ ++static unsigned int macsec_sectag_len(bool sci_present) ++{ ++ return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); ++} ++ ++static unsigned int macsec_hdr_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + ETH_HLEN; ++} ++ ++static unsigned int macsec_extra_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + sizeof(__be16); ++} ++ ++/* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ ++static void macsec_fill_sectag(struct macsec_eth_header *h, ++ const struct macsec_secy *secy, u32 pn, ++ bool sci_present) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); ++ h->eth.h_proto = htons(ETH_P_MACSEC); ++ ++ if (sci_present) { ++ h->tci_an |= MACSEC_TCI_SC; ++ memcpy(&h->secure_channel_id, &secy->sci, ++ sizeof(h->secure_channel_id)); ++ } else { ++ if (tx_sc->end_station) ++ h->tci_an |= MACSEC_TCI_ES; ++ if (tx_sc->scb) ++ h->tci_an |= MACSEC_TCI_SCB; ++ } ++ ++ h->packet_number = htonl(pn); ++ ++ /* with GCM, C/E clear for !encrypt, both set for encrypt */ ++ if (tx_sc->encrypt) ++ h->tci_an |= MACSEC_TCI_CONFID; ++ else if (secy->icv_len != DEFAULT_ICV_LEN) ++ h->tci_an |= MACSEC_TCI_C; ++ ++ h->tci_an |= tx_sc->encoding_sa; ++} ++ ++static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) ++{ ++ if (data_len < MIN_NON_SHORT_LEN) ++ h->short_length = data_len; ++} ++ ++/* Checks if a MACsec interface is being offloaded to an hardware engine */ ++static bool macsec_is_offloaded(struct macsec_dev *macsec) ++{ ++ if (macsec->offload == MACSEC_OFFLOAD_MAC || ++ macsec->offload == MACSEC_OFFLOAD_PHY) ++ return true; ++ ++ return false; ++} ++ ++/* Checks if underlying layers implement MACsec offloading functions. */ ++static bool macsec_check_offload(enum macsec_offload offload, ++ struct macsec_dev *macsec) ++{ ++ if (!macsec || !macsec->real_dev) ++ return false; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev && ++ macsec->real_dev->phydev->macsec_ops; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ return macsec->real_dev->features & NETIF_F_HW_MACSEC && ++ macsec->real_dev->macsec_ops; ++ ++ return false; ++} ++ ++static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, ++ struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (ctx) { ++ memset(ctx, 0, sizeof(*ctx)); ++ ctx->offload = offload; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ ctx->phydev = macsec->real_dev->phydev; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ ctx->netdev = macsec->real_dev; ++ } ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev->macsec_ops; ++ else ++ return macsec->real_dev->macsec_ops; ++} ++ ++/* Returns a pointer to the MACsec ops struct if any and updates the MACsec ++ * context device reference if provided. 
++ */ ++static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (!macsec_check_offload(macsec->offload, macsec)) ++ return NULL; ++ ++ return __macsec_get_ops(macsec->offload, macsec, ctx); ++} ++ ++/* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ ++static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) ++{ ++ struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; ++ int len = skb->len - 2 * ETH_ALEN; ++ int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; ++ ++ /* a) It comprises at least 17 octets */ ++ if (skb->len <= 16) ++ return false; ++ ++ /* b) MACsec EtherType: already checked */ ++ ++ /* c) V bit is clear */ ++ if (h->tci_an & MACSEC_TCI_VERSION) ++ return false; ++ ++ /* d) ES or SCB => !SC */ ++ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && ++ (h->tci_an & MACSEC_TCI_SC)) ++ return false; ++ ++ /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ ++ if (h->unused) ++ return false; ++ ++ /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ ++ if (!h->packet_number && !xpn) ++ return false; ++ ++ /* length check, f) g) h) i) */ ++ if (h->short_length) ++ return len == extra_len + h->short_length; ++ return len >= extra_len + MIN_NON_SHORT_LEN; ++} ++ ++#define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) ++#define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN ++ ++static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, ++ salt_t salt) ++{ ++ struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; ++ ++ gcm_iv->ssci = ssci ^ salt.ssci; ++ gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; ++} ++ ++static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) ++{ ++ struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; ++ ++ gcm_iv->sci = sci; ++ gcm_iv->pn = htonl(pn); ++} ++ ++static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) ++{ ++ return (struct macsec_eth_header *)skb_mac_header(skb); ++} ++ ++static void __macsec_pn_wrapped(struct macsec_secy *secy, ++ struct macsec_tx_sa *tx_sa) ++{ ++ pr_debug("PN wrapped, transitioning to !oper\n"); ++ tx_sa->active = false; ++ if (secy->protect_frames) ++ secy->operational = false; ++} ++ ++void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) ++{ ++ spin_lock_bh(&tx_sa->lock); ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++} ++EXPORT_SYMBOL_GPL(macsec_pn_wrapped); ++ ++static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, ++ struct macsec_secy *secy) ++{ ++ pn_t pn; ++ ++ spin_lock_bh(&tx_sa->lock); ++ ++ pn = tx_sa->next_pn_halves; ++ if (secy->xpn) ++ tx_sa->next_pn++; ++ else ++ tx_sa->next_pn_halves.lower++; ++ ++ if (tx_sa->next_pn == 0) ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ return pn; ++} ++ ++static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ ++ skb->dev = macsec->real_dev; ++ skb_reset_mac_header(skb); ++ skb->protocol = eth_hdr(skb)->h_proto; ++} ++ ++static unsigned int macsec_msdu_len(struct sk_buff *skb) ++{ ++ struct macsec_dev *macsec = macsec_priv(skb->dev); ++ struct macsec_secy *secy = &macsec->secy; ++ bool sci_present = macsec_skb_cb(skb)->has_sci; ++ ++ return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; ++} ++ ++static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, ++ struct macsec_tx_sa *tx_sa) ++{ ++ unsigned int msdu_len = 
macsec_msdu_len(skb); ++ struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); ++ ++ u64_stats_update_begin(&txsc_stats->syncp); ++ if (tx_sc->encrypt) { ++ txsc_stats->stats.OutOctetsEncrypted += msdu_len; ++ txsc_stats->stats.OutPktsEncrypted++; ++ this_cpu_inc(tx_sa->stats->OutPktsEncrypted); ++ } else { ++ txsc_stats->stats.OutOctetsProtected += msdu_len; ++ txsc_stats->stats.OutPktsProtected++; ++ this_cpu_inc(tx_sa->stats->OutPktsProtected); ++ } ++ u64_stats_update_end(&txsc_stats->syncp); ++} ++ ++static void count_tx(struct net_device *dev, int ret, int len) ++{ ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_add(&stats->tx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++ } ++} ++ ++static void macsec_encrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; ++ int len, ret; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ rcu_read_lock_bh(); ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ /* packet is encrypted/protected so tx_bytes must be calculated */ ++ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ rcu_read_unlock_bh(); ++ ++ macsec_txsa_put(sa); ++ dev_put(dev); ++} ++ ++static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, ++ unsigned char **iv, ++ struct scatterlist **sg, ++ int num_frags) ++{ ++ size_t size, iv_offset, sg_offset; ++ struct aead_request *req; ++ void *tmp; ++ ++ size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); ++ iv_offset = size; ++ size += GCM_AES_IV_LEN; ++ ++ size = ALIGN(size, __alignof__(struct scatterlist)); ++ sg_offset = size; ++ size += sizeof(struct scatterlist) * num_frags; ++ ++ tmp = kmalloc(size, GFP_ATOMIC); ++ if (!tmp) ++ return NULL; ++ ++ *iv = (unsigned char *)(tmp + iv_offset); ++ *sg = (struct scatterlist *)(tmp + sg_offset); ++ req = tmp; ++ ++ aead_request_set_tfm(req, tfm); ++ ++ return req; ++} ++ ++static struct sk_buff *macsec_encrypt(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; ++ unsigned char *iv; ++ struct ethhdr *eth; ++ struct macsec_eth_header *hh; ++ size_t unprotected_len; ++ struct aead_request *req; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ bool sci_present; ++ pn_t pn; ++ ++ secy = &macsec->secy; ++ tx_sc = &secy->tx_sc; ++ ++ /* 10.5.1 TX SA assignment */ ++ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); ++ if (!tx_sa) { ++ secy->operational = false; ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || ++ skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { ++ struct sk_buff *nskb = skb_copy_expand(skb, ++ MACSEC_NEEDED_HEADROOM, ++ MACSEC_NEEDED_TAILROOM, ++ GFP_ATOMIC); ++ if (likely(nskb)) { ++ consume_skb(skb); ++ skb = nskb; ++ } else { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-ENOMEM); ++ } ++ } ++ 
++ unprotected_len = skb->len; ++ eth = eth_hdr(skb); ++ sci_present = send_sci(secy); ++ hh = skb_push(skb, macsec_extra_len(sci_present)); ++ memmove(hh, eth, 2 * ETH_ALEN); ++ ++ pn = tx_sa_update_pn(tx_sa, secy); ++ if (pn.full64 == 0) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOLINK); ++ } ++ macsec_fill_sectag(hh, secy, pn.lower, sci_present); ++ macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); ++ ++ skb_put(skb, secy->icv_len); ++ ++ if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsTooLong++; ++ u64_stats_update_end(&secy_stats->syncp); ++ ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (secy->xpn) ++ macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); ++ else ++ macsec_fill_iv(iv, secy->sci, pn.lower); ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (tx_sc->encrypt) { ++ int len = skb->len - macsec_hdr_len(sci_present) - ++ secy->icv_len; ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(sci_present)); ++ } else { ++ aead_request_set_crypt(req, sg, sg, 0, iv); ++ aead_request_set_ad(req, skb->len - secy->icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ macsec_skb_cb(skb)->tx_sa = tx_sa; ++ macsec_skb_cb(skb)->has_sci = sci_present; ++ aead_request_set_callback(req, 0, macsec_encrypt_done, skb); ++ ++ dev_hold(skb->dev); ++ ret = crypto_aead_encrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ dev_put(skb->dev); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ ++ return skb; ++} ++ ++static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) ++{ ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); ++ struct macsec_eth_header *hdr = macsec_ethhdr(skb); ++ u32 lowest_pn = 0; ++ ++ spin_lock(&rx_sa->lock); ++ if (rx_sa->next_pn_halves.lower >= secy->replay_window) ++ lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; ++ ++ /* Now perform replay protection check again ++ * (see IEEE 802.1AE-2006 figure 10-5) ++ */ ++ if (secy->replay_protect && pn < lowest_pn && ++ (!secy->xpn || pn_same_half(pn, lowest_pn))) { ++ spin_unlock(&rx_sa->lock); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_dropped++; ++ return false; ++ } ++ ++ if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { ++ unsigned int msdu_len = macsec_msdu_len(skb); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (hdr->tci_an & MACSEC_TCI_E) ++ rxsc_stats->stats.InOctetsDecrypted += msdu_len; ++ else ++ rxsc_stats->stats.InOctetsValidated += msdu_len; ++ 
u64_stats_update_end(&rxsc_stats->syncp); ++ } ++ ++ if (!macsec_skb_cb(skb)->valid) { ++ spin_unlock(&rx_sa->lock); ++ ++ /* 10.6.5 */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotValid++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ this_cpu_inc(rx_sa->stats->InPktsNotValid); ++ secy->netdev->stats.rx_errors++; ++ return false; ++ } ++ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { ++ rxsc_stats->stats.InPktsInvalid++; ++ this_cpu_inc(rx_sa->stats->InPktsInvalid); ++ } else if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsUnchecked++; ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ } else { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsOK++; ++ this_cpu_inc(rx_sa->stats->InPktsOK); ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ ++ // Instead of "pn >=" - to support pn overflow in xpn ++ if (pn + 1 > rx_sa->next_pn_halves.lower) { ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } else if (secy->xpn && ++ !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { ++ rx_sa->next_pn_halves.upper++; ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } ++ ++ spin_unlock(&rx_sa->lock); ++ } ++ ++ return true; ++} ++ ++static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++} ++ ++static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) ++{ ++ skb->ip_summed = CHECKSUM_NONE; ++ memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); ++ skb_pull(skb, hdr_len); ++ pskb_trim_unique(skb, skb->len - icv_len); ++} ++ ++static void count_rx(struct net_device *dev, int len) ++{ ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_add(&stats->rx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++} ++ ++static void macsec_decrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct macsec_rx_sc *rx_sc = rx_sa->sc; ++ int len; ++ u32 pn; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ if (!err) ++ macsec_skb_cb(skb)->valid = true; ++ ++ rcu_read_lock_bh(); ++ pn = ntohl(macsec_ethhdr(skb)->packet_number); ++ if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { ++ rcu_read_unlock_bh(); ++ kfree_skb(skb); ++ goto out; ++ } ++ ++ macsec_finalize_skb(skb, macsec->secy.icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, macsec->secy.netdev); ++ ++ if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ ++ rcu_read_unlock_bh(); ++ ++out: ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ dev_put(dev); ++} ++ ++static struct sk_buff *macsec_decrypt(struct sk_buff *skb, ++ struct net_device *dev, ++ struct macsec_rx_sa *rx_sa, ++ sci_t sci, ++ struct macsec_secy *secy) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; 
++ unsigned char *iv; ++ struct aead_request *req; ++ struct macsec_eth_header *hdr; ++ u32 hdr_pn; ++ u16 icv_len = secy->icv_len; ++ ++ macsec_skb_cb(skb)->valid = false; ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ hdr = (struct macsec_eth_header *)skb->data; ++ hdr_pn = ntohl(hdr->packet_number); ++ ++ if (secy->xpn) { ++ pn_t recovered_pn = rx_sa->next_pn_halves; ++ ++ recovered_pn.lower = hdr_pn; ++ if (hdr_pn < rx_sa->next_pn_halves.lower && ++ !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) ++ recovered_pn.upper++; ++ ++ macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, ++ rx_sa->key.salt); ++ } else { ++ macsec_fill_iv(iv, sci, hdr_pn); ++ } ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (hdr->tci_an & MACSEC_TCI_E) { ++ /* confidentiality: ethernet + macsec header ++ * authenticated, encrypted payload ++ */ ++ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); ++ ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ aead_request_free(req); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ /* integrity only: all headers + data authenticated */ ++ aead_request_set_crypt(req, sg, sg, icv_len, iv); ++ aead_request_set_ad(req, skb->len - icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ skb->dev = dev; ++ aead_request_set_callback(req, 0, macsec_decrypt_done, skb); ++ ++ dev_hold(dev); ++ ret = crypto_aead_decrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ /* decryption/authentication failed ++ * 10.6 if validateFrames is disabled, deliver anyway ++ */ ++ if (ret != -EBADMSG) { ++ kfree_skb(skb); ++ skb = ERR_PTR(ret); ++ } ++ } else { ++ macsec_skb_cb(skb)->valid = true; ++ } ++ dev_put(dev); ++ ++ aead_request_free(req); ++ ++ return skb; ++} ++ ++static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) ++{ ++ /* Deliver to the uncontrolled port by default */ ++ enum rx_handler_result ret = RX_HANDLER_PASS; ++ struct ethhdr *hdr = eth_hdr(skb); ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ struct net_device *ndev = macsec->secy.netdev; ++ ++ /* If h/w offloading is enabled, HW decodes frames and strips ++ * the SecTAG, so we have to deduce which port to deliver to. 
++ */ ++ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->dev_addr)) { ++ /* exact match, divert skb to this port */ ++ skb->dev = ndev; ++ skb->pkt_type = PACKET_HOST; ++ ret = RX_HANDLER_ANOTHER; ++ goto out; ++ } else if (is_multicast_ether_addr_64bits( ++ hdr->h_dest)) { ++ /* multicast frame, deliver on this port too */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->broadcast)) ++ nskb->pkt_type = PACKET_BROADCAST; ++ else ++ nskb->pkt_type = PACKET_MULTICAST; ++ ++ __netif_rx(nskb); ++ } ++ continue; ++ } ++ ++ /* 10.6 If the management control validateFrames is not ++ * Strict, frames without a SecTAG are received, counted, and ++ * delivered to the Controlled Port ++ */ ++ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ continue; ++ } ++ ++ /* deliver on this port */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ ++ if (__netif_rx(nskb) == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) ++{ ++ struct sk_buff *skb = *pskb; ++ struct net_device *dev = skb->dev; ++ struct macsec_eth_header *hdr; ++ struct macsec_secy *secy = NULL; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ unsigned int len; ++ sci_t sci; ++ u32 hdr_pn; ++ bool cbit; ++ struct pcpu_rx_sc_stats *rxsc_stats; ++ struct pcpu_secy_stats *secy_stats; ++ bool pulled_sci; ++ int ret; ++ ++ if (skb_headroom(skb) < ETH_HLEN) ++ goto drop_direct; ++ ++ hdr = macsec_ethhdr(skb); ++ if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) ++ return handle_not_macsec(skb); ++ ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ *pskb = skb; ++ if (!skb) ++ return RX_HANDLER_CONSUMED; ++ ++ pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); ++ if (!pulled_sci) { ++ if (!pskb_may_pull(skb, macsec_extra_len(false))) ++ goto drop_direct; ++ } ++ ++ hdr = macsec_ethhdr(skb); ++ ++ /* Frames with a SecTAG that has the TCI E bit set but the C ++ * bit clear are discarded, as this reserved encoding is used ++ * to identify frames with a SecTAG that are not to be ++ * delivered to the Controlled Port. ++ */ ++ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) ++ return RX_HANDLER_PASS; ++ ++ /* now, pull the extra length */ ++ if (hdr->tci_an & MACSEC_TCI_SC) { ++ if (!pulled_sci) ++ goto drop_direct; ++ } ++ ++ /* ethernet header is part of crypto processing */ ++ skb_push(skb, ETH_HLEN); ++ ++ macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); ++ macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; ++ sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); ++ ++ sc = sc ? 
macsec_rxsc_get(sc) : NULL; ++ ++ if (sc) { ++ secy = &macsec->secy; ++ rx_sc = sc; ++ break; ++ } ++ } ++ ++ if (!secy) ++ goto nosci; ++ ++ dev = secy->netdev; ++ macsec = macsec_priv(dev); ++ secy_stats = this_cpu_ptr(macsec->stats); ++ rxsc_stats = this_cpu_ptr(rx_sc->stats); ++ ++ if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsBadTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ goto drop_nosa; ++ } ++ ++ rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); ++ if (!rx_sa) { ++ /* 10.6.1 if the SA is not in use */ ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ struct macsec_rx_sa *active_rx_sa = macsec_active_rxsa_get(rx_sc); ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotUsingSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); ++ goto drop_nosa; ++ } ++ ++ /* not Strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsUnusedSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsUnusedSA); ++ goto deliver; ++ } ++ ++ /* First, PN check to avoid decrypting obviously wrong packets */ ++ hdr_pn = ntohl(hdr->packet_number); ++ if (secy->replay_protect) { ++ bool late; ++ ++ spin_lock(&rx_sa->lock); ++ late = rx_sa->next_pn_halves.lower >= secy->replay_window && ++ hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); ++ ++ if (secy->xpn) ++ late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); ++ spin_unlock(&rx_sa->lock); ++ ++ if (late) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ goto drop; ++ } ++ } ++ ++ macsec_skb_cb(skb)->rx_sa = rx_sa; ++ ++ /* Disabled && !changed text => skip validation */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames != MACSEC_VALIDATE_DISABLED) ++ skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); ++ ++ if (IS_ERR(skb)) { ++ /* the decrypt callback needs the reference */ ++ if (PTR_ERR(skb) != -EINPROGRESS) { ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ } ++ rcu_read_unlock(); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ } ++ ++ if (!macsec_post_decrypt(skb, secy, hdr_pn)) ++ goto drop; ++ ++deliver: ++ macsec_finalize_skb(skb, secy->icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, secy->netdev); ++ ++ if (rx_sa) ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ ++ skb_orphan(skb); ++ ret = gro_cells_receive(&macsec->gro_cells, skb); ++ if (ret == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ else ++ macsec->secy.netdev->stats.rx_dropped++; ++ ++ rcu_read_unlock(); ++ ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++drop: ++ macsec_rxsa_put(rx_sa); ++drop_nosa: ++ macsec_rxsc_put(rx_sc); ++ rcu_read_unlock(); ++drop_direct: ++ kfree_skb(skb); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++nosci: ++ /* 10.6.1 if the SC is not found */ ++ cbit = !!(hdr->tci_an & MACSEC_TCI_C); ++ if (!cbit) ++ macsec_finalize_skb(skb, 
DEFAULT_ICV_LEN, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ ++ secy_stats = this_cpu_ptr(macsec->stats); ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ if (cbit || ++ macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_errors++; ++ continue; ++ } ++ ++ /* not strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ macsec_reset_skb(nskb, macsec->secy.netdev); ++ ++ ret = __netif_rx(nskb); ++ if (ret == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUnknownSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } else { ++ macsec->secy.netdev->stats.rx_dropped++; ++ } ++ } ++ ++ rcu_read_unlock(); ++ *pskb = skb; ++ return RX_HANDLER_PASS; ++} ++ ++static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) ++{ ++ struct crypto_aead *tfm; ++ int ret; ++ ++ /* Pick a sync gcm(aes) cipher to ensure order is preserved. */ ++ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); ++ ++ if (IS_ERR(tfm)) ++ return tfm; ++ ++ ret = crypto_aead_setkey(tfm, key, key_len); ++ if (ret < 0) ++ goto fail; ++ ++ ret = crypto_aead_setauthsize(tfm, icv_len); ++ if (ret < 0) ++ goto fail; ++ ++ return tfm; ++fail: ++ crypto_free_aead(tfm); ++ return ERR_PTR(ret); ++} ++ ++static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); ++ if (!rx_sa->stats) ++ return -ENOMEM; ++ ++ rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(rx_sa->key.tfm)) { ++ free_percpu(rx_sa->stats); ++ return PTR_ERR(rx_sa->key.tfm); ++ } ++ ++ rx_sa->ssci = MACSEC_UNDEF_SSCI; ++ rx_sa->active = false; ++ rx_sa->next_pn = 1; ++ refcount_set(&rx_sa->refcnt, 1); ++ spin_lock_init(&rx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_rx_sa(struct macsec_rx_sa *rx_sa) ++{ ++ rx_sa->active = false; ++ ++ macsec_rxsa_put(rx_sa); ++} ++ ++static void free_rx_sc(struct macsec_rx_sc *rx_sc) ++{ ++ int i; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); ++ ++ RCU_INIT_POINTER(rx_sc->sa[i], NULL); ++ if (sa) ++ clear_rx_sa(sa); ++ } ++ ++ macsec_rxsc_put(rx_sc); ++} ++ ++static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc, __rcu **rx_scp; ++ ++ for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); ++ rx_sc; ++ rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { ++ if (rx_sc->sci == sci) { ++ if (rx_sc->active) ++ secy->n_rx_sc--; ++ rcu_assign_pointer(*rx_scp, rx_sc->next); ++ return rx_sc; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, ++ bool active) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_dev *macsec; ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ struct macsec_secy *secy; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (find_rx_sc_rtnl(&macsec->secy, sci)) ++ return ERR_PTR(-EEXIST); ++ } ++ ++ rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); ++ if 
(!rx_sc) ++ return ERR_PTR(-ENOMEM); ++ ++ rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); ++ if (!rx_sc->stats) { ++ kfree(rx_sc); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ rx_sc->sci = sci; ++ rx_sc->active = active; ++ refcount_set(&rx_sc->refcnt, 1); ++ ++ secy = &macsec_priv(dev)->secy; ++ rcu_assign_pointer(rx_sc->next, secy->rx_sc); ++ rcu_assign_pointer(secy->rx_sc, rx_sc); ++ ++ if (rx_sc->active) ++ secy->n_rx_sc++; ++ ++ return rx_sc; ++} ++ ++static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); ++ if (!tx_sa->stats) ++ return -ENOMEM; ++ ++ tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(tx_sa->key.tfm)) { ++ free_percpu(tx_sa->stats); ++ return PTR_ERR(tx_sa->key.tfm); ++ } ++ ++ tx_sa->ssci = MACSEC_UNDEF_SSCI; ++ tx_sa->active = false; ++ refcount_set(&tx_sa->refcnt, 1); ++ spin_lock_init(&tx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_tx_sa(struct macsec_tx_sa *tx_sa) ++{ ++ tx_sa->active = false; ++ ++ macsec_txsa_put(tx_sa); ++} ++ ++static struct genl_family macsec_fam; ++ ++static struct net_device *get_dev_from_nl(struct net *net, ++ struct nlattr **attrs) ++{ ++ int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); ++ struct net_device *dev; ++ ++ dev = __dev_get_by_index(net, ifindex); ++ if (!dev) ++ return ERR_PTR(-ENODEV); ++ ++ if (!netif_is_macsec(dev)) ++ return ERR_PTR(-ENODEV); ++ ++ return dev; ++} ++ ++static enum macsec_offload nla_get_offload(const struct nlattr *nla) ++{ ++ return (__force enum macsec_offload)nla_get_u8(nla); ++} ++ ++static sci_t nla_get_sci(const struct nlattr *nla) ++{ ++ return (__force sci_t)nla_get_u64(nla); ++} ++ ++static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, ++ int padattr) ++{ ++ return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); ++} ++ ++static ssci_t nla_get_ssci(const struct nlattr *nla) ++{ ++ return (__force ssci_t)nla_get_u32(nla); ++} ++ ++static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) ++{ ++ return nla_put_u32(skb, attrtype, (__force u64)value); ++} ++ ++static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_tx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); ++ if (!tx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *devp = dev; ++ *scp = tx_sc; ++ *secyp = secy; ++ return tx_sa; ++} ++ ++static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct net_device **devp, ++ struct macsec_secy **secyp) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ secy = &macsec_priv(dev)->secy; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return ERR_PTR(-EINVAL); ++ ++ sci = 
nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ rx_sc = find_rx_sc_rtnl(secy, sci); ++ if (!rx_sc) ++ return ERR_PTR(-ENODEV); ++ ++ *secyp = secy; ++ *devp = dev; ++ ++ return rx_sc; ++} ++ ++static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_rx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); ++ if (IS_ERR(rx_sc)) ++ return ERR_CAST(rx_sc); ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); ++ if (!rx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *scp = rx_sc; ++ return rx_sa; ++} ++ ++static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { ++ [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, ++ [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { ++ [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, ++ [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, ++}; ++ ++static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { ++ [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), ++ [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, ++ .len = MACSEC_KEYID_LEN, }, ++ [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, ++ .len = MACSEC_MAX_KEY_LEN, }, ++ [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, ++ [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, ++ .len = MACSEC_SALT_LEN, }, ++}; ++ ++static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { ++ [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, ++}; ++ ++/* Offloads an operation to a device driver */ ++static int macsec_offload(int (* const func)(struct macsec_context *), ++ struct macsec_context *ctx) ++{ ++ int ret; ++ ++ if (unlikely(!func)) ++ return 0; ++ ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_lock(&ctx->phydev->lock); ++ ++ /* Phase I: prepare. The drive should fail here if there are going to be ++ * issues in the commit phase. ++ */ ++ ctx->prepare = true; ++ ret = (*func)(ctx); ++ if (ret) ++ goto phy_unlock; ++ ++ /* Phase II: commit. This step cannot fail. 
*/ ++ ctx->prepare = false; ++ ret = (*func)(ctx); ++ /* This should never happen: commit is not allowed to fail */ ++ if (unlikely(ret)) ++ WARN(1, "MACsec offloading commit failed (%d)\n", ret); ++ ++phy_unlock: ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_unlock(&ctx->phydev->lock); ++ ++ return ret; ++} ++ ++static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) ++{ ++ if (!attrs[MACSEC_ATTR_SA_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) ++{ ++ if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool validate_add_rxsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && ++ nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (tb_sa[MACSEC_SA_ATTR_PN] && ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); ++ if (rx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); ++ if (!rx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(rx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ rx_sa->sc = rx_sc; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_rxsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ macsec_rxsa_put(rx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static bool validate_add_rxsc(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_RXSC_ATTR_SCI]) ++ return false; ++ ++ if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ sci_t sci = MACSEC_UNDEF_SCI; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct macsec_secy *secy; ++ bool active = true; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) ++ active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ rx_sc = create_rx_sc(dev, sci, active); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return 
PTR_ERR(rx_sc); ++ } ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_add_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ del_rx_sc(secy, sci); ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_add_txsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_PN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_add_txsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); ++ if (tx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); ++ if (!tx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(tx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa && tx_sa->active) ++ secy->operational = true; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_txsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->operational = was_operational; ++ macsec_txsa_put(tx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (rx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ 
ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); ++ clear_rx_sa(rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), info->attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ rx_sc = del_rx_sc(secy, sci); ++ if (!rx_sc) { ++ rtnl_unlock(); ++ return -ENODEV; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ret = macsec_offload(ops->mdo_del_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); ++ clear_tx_sa(tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_upd_sa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ attrs[MACSEC_SA_ATTR_KEY] || ++ attrs[MACSEC_SA_ATTR_KEYID] || ++ attrs[MACSEC_SA_ATTR_SSCI] || ++ attrs[MACSEC_SA_ATTR_SALT]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int 
macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational, was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ prev_pn = tx_sa->next_pn_halves; ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ ++ was_active = tx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa) ++ secy->operational = tx_sa->active; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ tx_sa->active = was_active; ++ secy->operational = was_operational; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&rx_sa->lock); ++ prev_pn = rx_sa->next_pn_halves; ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ was_active = rx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ rx_sa->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ unsigned int prev_n_rx_sc; ++ bool was_active; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ was_active = rx_sc->active; ++ prev_n_rx_sc = secy->n_rx_sc; ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { ++ bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ if (rx_sc->active != new) ++ secy->n_rx_sc += new ? 
1 : -1; ++ ++ rx_sc->active = new; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->n_rx_sc = prev_n_rx_sc; ++ rx_sc->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool macsec_is_configured(struct macsec_dev *macsec) ++{ ++ struct macsec_secy *secy = &macsec->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ int i; ++ ++ if (secy->rx_sc) ++ return true; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) ++ if (tx_sc->sa[i]) ++ return true; ++ ++ return false; ++} ++ ++static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; ++ enum macsec_offload offload, prev_offload; ++ int (*func)(struct macsec_context *ctx); ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ struct macsec_dev *macsec; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (!attrs[MACSEC_ATTR_OFFLOAD]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, ++ attrs[MACSEC_ATTR_OFFLOAD], ++ macsec_genl_offload_policy, NULL)) ++ return -EINVAL; ++ ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) ++ return PTR_ERR(dev); ++ macsec = macsec_priv(dev); ++ ++ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) ++ return -EINVAL; ++ ++ offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); ++ if (macsec->offload == offload) ++ return 0; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* Check if the net device is busy. */ ++ if (netif_running(dev)) ++ return -EBUSY; ++ ++ rtnl_lock(); ++ ++ prev_offload = macsec->offload; ++ macsec->offload = offload; ++ ++ /* Check if the device already has rules configured: we do not support ++ * rules migration. ++ */ ++ if (macsec_is_configured(macsec)) { ++ ret = -EBUSY; ++ goto rollback; ++ } ++ ++ ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? 
prev_offload : offload, ++ macsec, &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto rollback; ++ } ++ ++ if (prev_offload == MACSEC_OFFLOAD_OFF) ++ func = ops->mdo_add_secy; ++ else ++ func = ops->mdo_del_secy; ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(func, &ctx); ++ if (ret) ++ goto rollback; ++ ++ rtnl_unlock(); ++ return 0; ++ ++rollback: ++ macsec->offload = prev_offload; ++ ++ rtnl_unlock(); ++ return ret; ++} ++ ++static void get_tx_sa_stats(struct net_device *dev, int an, ++ struct macsec_tx_sa *tx_sa, ++ struct macsec_tx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.stats.tx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_tx_sa_stats *stats = ++ per_cpu_ptr(tx_sa->stats, cpu); ++ ++ sum->OutPktsProtected += stats->OutPktsProtected; ++ sum->OutPktsEncrypted += stats->OutPktsEncrypted; ++ } ++} ++ ++static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sa_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, int an, ++ struct macsec_rx_sa *rx_sa, ++ struct macsec_rx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.stats.rx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_rx_sa_stats *stats = ++ per_cpu_ptr(rx_sa->stats, cpu); ++ ++ sum->InPktsOK += stats->InPktsOK; ++ sum->InPktsInvalid += stats->InPktsInvalid; ++ sum->InPktsNotValid += stats->InPktsNotValid; ++ sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; ++ sum->InPktsUnusedSA += stats->InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sa_stats(struct sk_buff *skb, ++ struct macsec_rx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sc_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, ++ struct macsec_rx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = 
macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.rx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_rx_sc_stats *stats; ++ struct macsec_rx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(rx_sc->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->InOctetsValidated += tmp.InOctetsValidated; ++ sum->InOctetsDecrypted += tmp.InOctetsDecrypted; ++ sum->InPktsUnchecked += tmp.InPktsUnchecked; ++ sum->InPktsDelayed += tmp.InPktsDelayed; ++ sum->InPktsOK += tmp.InPktsOK; ++ sum->InPktsInvalid += tmp.InPktsInvalid; ++ sum->InPktsLate += tmp.InPktsLate; ++ sum->InPktsNotValid += tmp.InPktsNotValid; ++ sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; ++ sum->InPktsUnusedSA += tmp.InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, ++ sum->InOctetsValidated, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, ++ sum->InOctetsDecrypted, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, ++ sum->InPktsUnchecked, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, ++ sum->InPktsDelayed, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, ++ sum->InPktsOK, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, ++ sum->InPktsLate, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA, ++ MACSEC_RXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_tx_sc_stats(struct net_device *dev, ++ struct macsec_tx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.tx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_tx_sc_stats *stats; ++ struct macsec_tx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsProtected += tmp.OutPktsProtected; ++ sum->OutPktsEncrypted += tmp.OutPktsEncrypted; ++ sum->OutOctetsProtected += tmp.OutOctetsProtected; ++ sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; ++ } ++} ++ ++static int 
copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, ++ sum->OutOctetsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, ++ sum->OutOctetsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.dev_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_dev_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_secy_stats *stats; ++ struct macsec_dev_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsUntagged += tmp.OutPktsUntagged; ++ sum->InPktsUntagged += tmp.InPktsUntagged; ++ sum->OutPktsTooLong += tmp.OutPktsTooLong; ++ sum->InPktsNoTag += tmp.InPktsNoTag; ++ sum->InPktsBadTag += tmp.InPktsBadTag; ++ sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; ++ sum->InPktsNoSCI += tmp.InPktsNoSCI; ++ sum->InPktsOverrun += tmp.InPktsOverrun; ++ } ++} ++ ++static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, ++ sum->OutPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, ++ sum->InPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, ++ sum->OutPktsTooLong, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, ++ sum->InPktsNoTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, ++ sum->InPktsBadTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, ++ sum->InPktsUnknownSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, ++ sum->InPktsNoSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, ++ sum->InPktsOverrun, ++ MACSEC_SECY_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) ++{ ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *secy_nest = nla_nest_start_noflag(skb, ++ MACSEC_ATTR_SECY); ++ u64 csid; ++ ++ if (!secy_nest) ++ return 1; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto cancel; ++ } ++ ++ if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, ++ MACSEC_SECY_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, ++ csid, MACSEC_SECY_ATTR_PAD) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) ++ goto cancel; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) ++ goto cancel; ++ } ++ ++ nla_nest_end(skb, secy_nest); ++ return 0; ++ ++cancel: ++ nla_nest_cancel(skb, secy_nest); ++ return 1; ++} ++ ++static noinline_for_stack int ++dump_secy(struct macsec_secy *secy, struct net_device *dev, ++ struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct macsec_tx_sc_stats tx_sc_stats = {0, }; ++ struct macsec_tx_sa_stats tx_sa_stats = {0, }; ++ struct macsec_rx_sc_stats rx_sc_stats = {0, }; ++ struct macsec_rx_sa_stats rx_sa_stats = {0, }; ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_dev_stats dev_stats = {0, }; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *txsa_list, *rxsc_list; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *attr; ++ void *hdr; ++ int i, j; ++ ++ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); ++ if (!hdr) ++ return -EMSGSIZE; ++ ++ genl_dump_check_consistent(cb, hdr); ++ ++ if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); ++ if (!attr) ++ goto nla_put_failure; ++ if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) ++ goto nla_put_failure; ++ nla_nest_end(skb, attr); ++ ++ if (nla_put_secy(secy, skb)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ ++ get_tx_sc_stats(dev, &tx_sc_stats); ++ if (copy_tx_sc_stats(skb, &tx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ get_secy_stats(dev, &dev_stats); ++ if (copy_secy_stats(skb, &dev_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); ++ if (!txsa_list) ++ goto nla_put_failure; ++ for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); ++ struct nlattr *txsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!tx_sa) ++ continue; ++ ++ txsa_nest = nla_nest_start_noflag(skb, j++); ++ if (!txsa_nest) { ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto 
nla_put_failure; ++ } ++ memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); ++ get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); ++ if (copy_tx_sa_stats(skb, &tx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = tx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = tx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ nla_nest_end(skb, txsa_nest); ++ } ++ nla_nest_end(skb, txsa_list); ++ ++ rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); ++ if (!rxsc_list) ++ goto nla_put_failure; ++ ++ j = 1; ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ int k; ++ struct nlattr *rxsa_list; ++ struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); ++ ++ if (!rxsc_nest) { ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || ++ nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, ++ MACSEC_RXSC_ATTR_PAD)) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); ++ get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); ++ if (copy_rx_sc_stats(skb, &rx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ rxsa_list = nla_nest_start_noflag(skb, ++ MACSEC_RXSC_ATTR_SA_LIST); ++ if (!rxsa_list) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); ++ struct nlattr *rxsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!rx_sa) ++ continue; ++ ++ rxsa_nest = nla_nest_start_noflag(skb, k++); ++ if (!rxsa_nest) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, ++ MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); ++ get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); ++ if (copy_rx_sa_stats(skb, &rx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = rx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = rx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, 
MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { ++ nla_nest_cancel(skb, rxsa_nest); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, rxsa_nest); ++ } ++ ++ nla_nest_end(skb, rxsa_list); ++ nla_nest_end(skb, rxsc_nest); ++ } ++ ++ nla_nest_end(skb, rxsc_list); ++ ++ genlmsg_end(skb, hdr); ++ ++ return 0; ++ ++nla_put_failure: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++static int macsec_generation = 1; /* protected by RTNL */ ++ ++static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct net_device *dev; ++ int dev_idx, d; ++ ++ dev_idx = cb->args[0]; ++ ++ d = 0; ++ rtnl_lock(); ++ ++ cb->seq = macsec_generation; ++ ++ for_each_netdev(net, dev) { ++ struct macsec_secy *secy; ++ ++ if (d < dev_idx) ++ goto next; ++ ++ if (!netif_is_macsec(dev)) ++ goto next; ++ ++ secy = &macsec_priv(dev)->secy; ++ if (dump_secy(secy, dev, skb, cb) < 0) ++ goto done; ++next: ++ d++; ++ } ++ ++done: ++ rtnl_unlock(); ++ cb->args[0] = d; ++ return skb->len; ++} ++ ++static const struct genl_small_ops macsec_genl_ops[] = { ++ { ++ .cmd = MACSEC_CMD_GET_TXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .dumpit = macsec_dump_txsc, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_OFFLOAD, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_offload, ++ .flags = GENL_ADMIN_PERM, ++ }, ++}; ++ ++static struct genl_family macsec_fam __ro_after_init = { ++ .name = MACSEC_GENL_NAME, ++ .hdrsize = 0, ++ .version = MACSEC_GENL_VERSION, ++ .maxattr = MACSEC_ATTR_MAX, ++ .policy = macsec_genl_policy, ++ .netnsok = true, ++ .module = THIS_MODULE, ++ .small_ops = macsec_genl_ops, ++ .n_small_ops = ARRAY_SIZE(macsec_genl_ops), ++}; ++ ++static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, ++ 
struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ struct pcpu_secy_stats *secy_stats; ++ int ret, len; ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ skb->dev = macsec->real_dev; ++ return dev_queue_xmit(skb); ++ } ++ ++ /* 10.5 */ ++ if (!secy->protect_frames) { ++ secy_stats = this_cpu_ptr(macsec->stats); ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ skb->dev = macsec->real_dev; ++ len = skb->len; ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++ } ++ ++ if (!secy->operational) { ++ kfree_skb(skb); ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ len = skb->len; ++ skb = macsec_encrypt(skb, dev); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) != -EINPROGRESS) ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++} ++ ++#define MACSEC_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) ++ ++static int macsec_dev_init(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ return -ENOMEM; ++ ++ err = gro_cells_init(&macsec->gro_cells, dev); ++ if (err) { ++ free_percpu(dev->tstats); ++ return err; ++ } ++ ++ dev->features = real_dev->features & MACSEC_FEATURES; ++ dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; ++ ++ dev->needed_headroom = real_dev->needed_headroom + ++ MACSEC_NEEDED_HEADROOM; ++ dev->needed_tailroom = real_dev->needed_tailroom + ++ MACSEC_NEEDED_TAILROOM; ++ ++ if (is_zero_ether_addr(dev->dev_addr)) ++ eth_hw_addr_inherit(dev, real_dev); ++ if (is_zero_ether_addr(dev->broadcast)) ++ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); ++ ++ /* Get macsec's reference to real_dev */ ++ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); ++ ++ return 0; ++} ++ ++static void macsec_dev_uninit(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ gro_cells_destroy(&macsec->gro_cells); ++ free_percpu(dev->tstats); ++} ++ ++static netdev_features_t macsec_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ features &= (real_dev->features & MACSEC_FEATURES) | ++ NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; ++ features |= NETIF_F_LLTX; ++ ++ return features; ++} ++ ++static int macsec_dev_open(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ err = dev_uc_add(real_dev, dev->dev_addr); ++ if (err < 0) ++ return err; ++ ++ if (dev->flags & IFF_ALLMULTI) { ++ err = dev_set_allmulti(real_dev, 1); ++ if (err < 0) ++ goto del_unicast; ++ } ++ ++ if (dev->flags & IFF_PROMISC) { ++ err = dev_set_promiscuity(real_dev, 1); ++ if (err < 0) ++ goto clear_allmulti; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto clear_allmulti; ++ } 
++ ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_dev_open, &ctx); ++ if (err) ++ goto clear_allmulti; ++ } ++ ++ if (netif_carrier_ok(real_dev)) ++ netif_carrier_on(dev); ++ ++ return 0; ++clear_allmulti: ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++del_unicast: ++ dev_uc_del(real_dev, dev->dev_addr); ++ netif_carrier_off(dev); ++ return err; ++} ++ ++static int macsec_dev_stop(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ netif_carrier_off(dev); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_dev_stop, &ctx); ++ } ++ } ++ ++ dev_mc_unsync(real_dev, dev); ++ dev_uc_unsync(real_dev, dev); ++ ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++ ++ if (dev->flags & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, -1); ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++ return 0; ++} ++ ++static void macsec_dev_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ if (!(dev->flags & IFF_UP)) ++ return; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); ++ ++ if (change & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, ++ dev->flags & IFF_PROMISC ? 1 : -1); ++} ++ ++static void macsec_dev_set_rx_mode(struct net_device *dev) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ dev_mc_sync(real_dev, dev); ++ dev_uc_sync(real_dev, dev); ++} ++ ++static int macsec_set_mac_address(struct net_device *dev, void *p) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct sockaddr *addr = p; ++ int err; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (!(dev->flags & IFF_UP)) ++ goto out; ++ ++ err = dev_uc_add(real_dev, addr->sa_data); ++ if (err < 0) ++ return err; ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++out: ++ eth_hw_addr_set(dev, addr->sa_data); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_upd_secy, &ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++static int macsec_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); ++ ++ if (macsec->real_dev->mtu - extra < new_mtu) ++ return -ERANGE; ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static void macsec_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ if (!dev->tstats) ++ return; ++ ++ dev_fetch_sw_netstats(s, dev->tstats); ++ ++ s->rx_dropped = dev->stats.rx_dropped; ++ s->tx_dropped = dev->stats.tx_dropped; ++ s->rx_errors = dev->stats.rx_errors; ++} ++ ++static int macsec_get_iflink(const struct net_device *dev) ++{ ++ return macsec_priv(dev)->real_dev->ifindex; ++} ++ ++static const struct net_device_ops macsec_netdev_ops = { ++ .ndo_init = macsec_dev_init, ++ .ndo_uninit = macsec_dev_uninit, ++ .ndo_open = macsec_dev_open, ++ .ndo_stop = macsec_dev_stop, ++ .ndo_fix_features = 
macsec_fix_features, ++ .ndo_change_mtu = macsec_change_mtu, ++ .ndo_set_rx_mode = macsec_dev_set_rx_mode, ++ .ndo_change_rx_flags = macsec_dev_change_rx_flags, ++ .ndo_set_mac_address = macsec_set_mac_address, ++ .ndo_start_xmit = macsec_start_xmit, ++ .ndo_get_stats64 = macsec_get_stats64, ++ .ndo_get_iflink = macsec_get_iflink, ++}; ++ ++static const struct device_type macsec_type = { ++ .name = "macsec", ++}; ++ ++static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { ++ [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, ++ [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, ++ [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ES] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, ++}; ++ ++static void macsec_free_netdev(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ free_percpu(macsec->stats); ++ free_percpu(macsec->secy.tx_sc.stats); ++ ++ /* Get rid of the macsec's reference to real_dev */ ++ netdev_put(macsec->real_dev, &macsec->dev_tracker); ++} ++ ++static void macsec_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ dev->min_mtu = 0; ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->netdev_ops = &macsec_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->priv_destructor = macsec_free_netdev; ++ SET_NETDEV_DEVTYPE(dev, &macsec_type); ++ ++ eth_zero_addr(dev->broadcast); ++} ++ ++static int macsec_changelink_common(struct net_device *dev, ++ struct nlattr *data[]) ++{ ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ struct macsec_tx_sa *tx_sa; ++ ++ tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); ++ tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); ++ ++ secy->operational = tx_sa && tx_sa->active; ++ } ++ ++ if (data[IFLA_MACSEC_ENCRYPT]) ++ tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); ++ ++ if (data[IFLA_MACSEC_PROTECT]) ++ secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); ++ ++ if (data[IFLA_MACSEC_INC_SCI]) ++ tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (data[IFLA_MACSEC_ES]) ++ tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); ++ ++ if (data[IFLA_MACSEC_SCB]) ++ tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); ++ ++ if (data[IFLA_MACSEC_REPLAY_PROTECT]) ++ secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); ++ ++ if (data[IFLA_MACSEC_VALIDATION]) ++ secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) { ++ switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = true; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ secy->key_len = 
MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = true; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (data[IFLA_MACSEC_WINDOW]) { ++ secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); ++ ++ /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window ++ * for XPN cipher suites */ ++ if (secy->xpn && ++ secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], ++ struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sc tx_sc; ++ struct macsec_secy secy; ++ int ret; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE] || ++ data[IFLA_MACSEC_ICV_LEN] || ++ data[IFLA_MACSEC_SCI] || ++ data[IFLA_MACSEC_PORT]) ++ return -EINVAL; ++ ++ /* Keep a copy of unmodified secy and tx_sc, in case the offload ++ * propagation fails, to revert macsec_changelink_common. ++ */ ++ memcpy(&secy, &macsec->secy, sizeof(secy)); ++ memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ++ ++ ret = macsec_changelink_common(dev, data); ++ if (ret) ++ goto cleanup; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(ops->mdo_upd_secy, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ return 0; ++ ++cleanup: ++ memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); ++ memcpy(&macsec->secy, &secy, sizeof(secy)); ++ ++ return ret; ++} ++ ++static void macsec_del_dev(struct macsec_dev *macsec) ++{ ++ int i; ++ ++ while (macsec->secy.rx_sc) { ++ struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); ++ ++ rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); ++ free_rx_sc(rx_sc); ++ } ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); ++ ++ if (sa) { ++ RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); ++ clear_tx_sa(sa); ++ } ++ } ++} ++ ++static void macsec_common_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_del_secy, &ctx); ++ } ++ } ++ ++ unregister_netdevice_queue(dev, head); ++ list_del_rcu(&macsec->secys); ++ macsec_del_dev(macsec); ++ netdev_upper_dev_unlink(real_dev, dev); ++ ++ macsec_generation++; ++} ++ ++static void macsec_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ macsec_common_dellink(dev, head); ++ ++ if (list_empty(&rxd->secys)) { ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ } ++} ++ ++static int register_macsec_dev(struct net_device *real_dev, ++ struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ if (!rxd) { ++ int err; ++ ++ rxd = kmalloc(sizeof(*rxd), 
GFP_KERNEL); ++ if (!rxd) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&rxd->secys); ++ ++ err = netdev_rx_handler_register(real_dev, macsec_handle_frame, ++ rxd); ++ if (err < 0) { ++ kfree(rxd); ++ return err; ++ } ++ } ++ ++ list_add_tail_rcu(&macsec->secys, &rxd->secys); ++ return 0; ++} ++ ++static bool sci_exists(struct net_device *dev, sci_t sci) ++{ ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); ++ struct macsec_dev *macsec; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (macsec->secy.sci == sci) ++ return true; ++ } ++ ++ return false; ++} ++ ++static sci_t dev_to_sci(struct net_device *dev, __be16 port) ++{ ++ return make_sci(dev->dev_addr, port); ++} ++ ++static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ ++ macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); ++ if (!macsec->stats) ++ return -ENOMEM; ++ ++ secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); ++ if (!secy->tx_sc.stats) { ++ free_percpu(macsec->stats); ++ return -ENOMEM; ++ } ++ ++ if (sci == MACSEC_UNDEF_SCI) ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ secy->netdev = dev; ++ secy->operational = true; ++ secy->key_len = DEFAULT_SAK_LEN; ++ secy->icv_len = icv_len; ++ secy->validate_frames = MACSEC_VALIDATE_DEFAULT; ++ secy->protect_frames = true; ++ secy->replay_protect = false; ++ secy->xpn = DEFAULT_XPN; ++ ++ secy->sci = sci; ++ secy->tx_sc.active = true; ++ secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; ++ secy->tx_sc.encrypt = DEFAULT_ENCRYPT; ++ secy->tx_sc.send_sci = DEFAULT_SEND_SCI; ++ secy->tx_sc.end_station = false; ++ secy->tx_sc.scb = false; ++ ++ return 0; ++} ++ ++static struct lock_class_key macsec_netdev_addr_lock_key; ++ ++static int macsec_newlink(struct net *net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ rx_handler_func_t *rx_handler; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ struct net_device *real_dev; ++ int err, mtu; ++ sci_t sci; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); ++ if (!real_dev) ++ return -ENODEV; ++ if (real_dev->type != ARPHRD_ETHER) ++ return -EINVAL; ++ ++ dev->priv_flags |= IFF_MACSEC; ++ ++ macsec->real_dev = real_dev; ++ ++ if (data && data[IFLA_MACSEC_OFFLOAD]) ++ macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); ++ else ++ /* MACsec offloading is off by default */ ++ macsec->offload = MACSEC_OFFLOAD_OFF; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (macsec->offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(macsec->offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* send_sci must be set to true when transmit sci explicitly is set */ ++ if ((data && data[IFLA_MACSEC_SCI]) && ++ (data && data[IFLA_MACSEC_INC_SCI])) { ++ u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (!send_sci) ++ return -EINVAL; ++ } ++ ++ if (data && data[IFLA_MACSEC_ICV_LEN]) ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ mtu = real_dev->mtu - icv_len - macsec_extra_len(true); ++ if (mtu < 0) ++ dev->mtu = 0; ++ else ++ dev->mtu = mtu; ++ ++ rx_handler = rtnl_dereference(real_dev->rx_handler); ++ if (rx_handler && rx_handler != macsec_handle_frame) ++ return -EBUSY; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ 
netdev_lockdep_set_classes(dev); ++ lockdep_set_class(&dev->addr_list_lock, ++ &macsec_netdev_addr_lock_key); ++ ++ err = netdev_upper_dev_link(real_dev, dev, extack); ++ if (err < 0) ++ goto unregister; ++ ++ /* need to be already registered so that ->init has run and ++ * the MAC addr is set ++ */ ++ if (data && data[IFLA_MACSEC_SCI]) ++ sci = nla_get_sci(data[IFLA_MACSEC_SCI]); ++ else if (data && data[IFLA_MACSEC_PORT]) ++ sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); ++ else ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ if (rx_handler && sci_exists(real_dev, sci)) { ++ err = -EBUSY; ++ goto unlink; ++ } ++ ++ err = macsec_add_dev(dev, sci, icv_len); ++ if (err) ++ goto unlink; ++ ++ if (data) { ++ err = macsec_changelink_common(dev, data); ++ if (err) ++ goto del_dev; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_add_secy, &ctx); ++ if (err) ++ goto del_dev; ++ } ++ } ++ ++ err = register_macsec_dev(real_dev, dev); ++ if (err < 0) ++ goto del_dev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ linkwatch_fire_event(dev); ++ ++ macsec_generation++; ++ ++ return 0; ++ ++del_dev: ++ macsec_del_dev(macsec); ++unlink: ++ netdev_upper_dev_unlink(real_dev, dev); ++unregister: ++ unregister_netdevice(dev); ++ return err; ++} ++ ++static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ u64 csid = MACSEC_DEFAULT_CIPHER_ID; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ int flag; ++ bool es, scb, sci; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) ++ csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); ++ ++ if (data[IFLA_MACSEC_ICV_LEN]) { ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ if (icv_len != DEFAULT_ICV_LEN) { ++ char dummy_key[DEFAULT_SAK_LEN] = { 0 }; ++ struct crypto_aead *dummy_tfm; ++ ++ dummy_tfm = macsec_alloc_tfm(dummy_key, ++ DEFAULT_SAK_LEN, ++ icv_len); ++ if (IS_ERR(dummy_tfm)) ++ return PTR_ERR(dummy_tfm); ++ crypto_free_aead(dummy_tfm); ++ } ++ } ++ ++ switch (csid) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ if (icv_len < MACSEC_MIN_ICV_LEN || ++ icv_len > MACSEC_STD_ICV_LEN) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) ++ return -EINVAL; ++ } ++ ++ for (flag = IFLA_MACSEC_ENCODING_SA + 1; ++ flag < IFLA_MACSEC_VALIDATION; ++ flag++) { ++ if (data[flag]) { ++ if (nla_get_u8(data[flag]) > 1) ++ return -EINVAL; ++ } ++ } ++ ++ es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; ++ sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; ++ scb = data[IFLA_MACSEC_SCB] ? 
nla_get_u8(data[IFLA_MACSEC_SCB]) : false; ++ ++ if ((sci && (scb || es)) || (scb && es)) ++ return -EINVAL; ++ ++ if (data[IFLA_MACSEC_VALIDATION] && ++ nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) ++ return -EINVAL; ++ ++ if ((data[IFLA_MACSEC_REPLAY_PROTECT] && ++ nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && ++ !data[IFLA_MACSEC_WINDOW]) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct net *macsec_get_link_net(const struct net_device *dev) ++{ ++ return dev_net(macsec_priv(dev)->real_dev); ++} ++ ++static size_t macsec_get_size(const struct net_device *dev) ++{ ++ return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ ++ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ ++ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ ++ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ES */ ++ nla_total_size(1) + /* IFLA_MACSEC_SCB */ ++ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ ++ 0; ++} ++ ++static int macsec_fill_info(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct macsec_secy *secy = &macsec_priv(dev)->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ u64 csid; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, ++ IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || ++ nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, ++ csid, IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || ++ nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || ++ nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || ++ nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || ++ 0) ++ goto nla_put_failure; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -EMSGSIZE; ++} ++ ++static struct rtnl_link_ops macsec_link_ops __read_mostly = { ++ .kind = "macsec", ++ .priv_size = sizeof(struct macsec_dev), ++ .maxtype = IFLA_MACSEC_MAX, ++ .policy = macsec_rtnl_policy, ++ .setup = macsec_setup, ++ .validate = macsec_validate_attr, ++ .newlink = macsec_newlink, ++ .changelink = macsec_changelink, ++ .dellink = macsec_dellink, ++ .get_size = macsec_get_size, ++ .fill_info = macsec_fill_info, ++ .get_link_net = macsec_get_link_net, ++}; ++ ++static bool is_macsec_master(struct net_device *dev) ++{ ++ return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; ++} ++ ++static int macsec_notify(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); ++ LIST_HEAD(head); ++ ++ if (!is_macsec_master(real_dev)) ++ return NOTIFY_DONE; 
++ ++ switch (event) { ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ case NETDEV_CHANGE: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ } ++ break; ++ } ++ case NETDEV_UNREGISTER: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ macsec_common_dellink(m->secy.netdev, &head); ++ } ++ ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ ++ unregister_netdevice_many(&head); ++ break; ++ } ++ case NETDEV_CHANGEMTU: { ++ struct macsec_dev *m; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry(m, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ unsigned int mtu = real_dev->mtu - (m->secy.icv_len + ++ macsec_extra_len(true)); ++ ++ if (dev->mtu > mtu) ++ dev_set_mtu(dev, mtu); ++ } ++ } ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block macsec_notifier = { ++ .notifier_call = macsec_notify, ++}; ++ ++static int __init macsec_init(void) ++{ ++ int err; ++ ++ pr_info("MACsec IEEE 802.1AE\n"); ++ err = register_netdevice_notifier(&macsec_notifier); ++ if (err) ++ return err; ++ ++ err = rtnl_link_register(&macsec_link_ops); ++ if (err) ++ goto notifier; ++ ++ err = genl_register_family(&macsec_fam); ++ if (err) ++ goto rtnl; ++ ++ return 0; ++ ++rtnl: ++ rtnl_link_unregister(&macsec_link_ops); ++notifier: ++ unregister_netdevice_notifier(&macsec_notifier); ++ return err; ++} ++ ++static void __exit macsec_exit(void) ++{ ++ genl_unregister_family(&macsec_fam); ++ rtnl_link_unregister(&macsec_link_ops); ++ unregister_netdevice_notifier(&macsec_notifier); ++ rcu_barrier(); ++} ++ ++module_init(macsec_init); ++module_exit(macsec_exit); ++ ++MODULE_ALIAS_RTNL_LINK("macsec"); ++MODULE_ALIAS_GENL_FAMILY("macsec"); ++ ++MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/macvlan.c linux/drivers/net/macvlan.c +--- linux.orig/drivers/net/macvlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macvlan.c 2022-12-04 10:40:26.696034096 -0500 +@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(stru for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { @@ -2829,11 +20477,10 @@ index 1080d6ebff63b..a1c7823f0ba66 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c -index 0b1b6f650104b..ff302144029de 100644 ---- a/drivers/net/mhi_net.c -+++ b/drivers/net/mhi_net.c -@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/mhi_net.c linux/drivers/net/mhi_net.c +--- linux.orig/drivers/net/mhi_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/mhi_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct n unsigned int start; do { @@ -2857,11 +20504,10 @@ index 0b1b6f650104b..ff302144029de 100644 } static const struct net_device_ops mhi_netdev_ops = { -diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c -index 9a1a5b2036240..e470e3398abc2 100644 ---- a/drivers/net/netdevsim/netdev.c -+++ b/drivers/net/netdevsim/netdev.c -@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 
+diff -rupN linux.orig/drivers/net/netdevsim/netdev.c linux/drivers/net/netdevsim/netdev.c +--- linux.orig/drivers/net/netdevsim/netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/netdevsim/netdev.c 2022-12-04 10:40:26.696034096 -0500 +@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, unsigned int start; do { @@ -2874,11 +20520,10 @@ index 9a1a5b2036240..e470e3398abc2 100644 } static int -diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c -index 154a3c0a6dfd8..3de937141c168 100644 ---- a/drivers/net/team/team.c -+++ b/drivers/net/team/team.c -@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/team/team.c linux/drivers/net/team/team.c +--- linux.orig/drivers/net/team/team.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { @@ -2894,11 +20539,10 @@ index 154a3c0a6dfd8..3de937141c168 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c -index b095a4b4957bb..18d99fda997cf 100644 ---- a/drivers/net/team/team_mode_loadbalance.c -+++ b/drivers/net/team/team_mode_loadbalance.c -@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats, +diff -rupN linux.orig/drivers/net/team/team_mode_loadbalance.c linux/drivers/net/team/team_mode_loadbalance.c +--- linux.orig/drivers/net/team/team_mode_loadbalance.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team_mode_loadbalance.c 2022-12-04 10:40:26.696034096 -0500 +@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struc struct lb_stats tmp; do { @@ -2910,11 +20554,10 @@ index b095a4b4957bb..18d99fda997cf 100644 acc_stats->tx_bytes += tmp.tx_bytes; } -diff --git a/drivers/net/veth.c b/drivers/net/veth.c -index 466da01ba2e3e..2da7cfcfe1c31 100644 ---- a/drivers/net/veth.c -+++ b/drivers/net/veth.c -@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/veth.c linux/drivers/net/veth.c +--- linux.orig/drivers/net/veth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/veth.c 2022-12-04 10:40:26.696034096 -0500 +@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struc size_t offset; do { @@ -2929,7 +20572,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 idx += VETH_RQ_STATS_LEN; } -@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struc tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { @@ -2944,7 +20587,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 } } -@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) +@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_st unsigned int start; do { @@ -2960,11 +20603,10 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; -diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c -index 9cce7dec7366d..a94d9d8f67fd0 100644 ---- a/drivers/net/virtio_net.c -+++ b/drivers/net/virtio_net.c -@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/virtio_net.c 
linux/drivers/net/virtio_net.c +--- linux.orig/drivers/net/virtio_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/virtio_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_dev struct send_queue *sq = &vi->sq[i]; do { @@ -2987,7 +20629,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 tot->rx_packets += rpackets; tot->tx_packets += tpackets; -@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&rq->stats; do { @@ -3002,7 +20644,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_RQ_STATS_LEN; } -@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&sq->stats; do { @@ -3017,11 +20659,10 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_SQ_STATS_LEN; } } -diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c -index 5df7a0abc39d5..191ebc482f0c1 100644 ---- a/drivers/net/vrf.c -+++ b/drivers/net/vrf.c -@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/vrf.c linux/drivers/net/vrf.c +--- linux.orig/drivers/net/vrf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vrf.c 2022-12-04 10:40:26.696034096 -0500 +@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_d dstats = per_cpu_ptr(dev->dstats, i); do { @@ -3037,11 +20678,10 @@ index 5df7a0abc39d5..191ebc482f0c1 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; -diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c -index 3e04af4c5daa1..a3de081cda5ee 100644 ---- a/drivers/net/vxlan/vxlan_vnifilter.c -+++ b/drivers/net/vxlan/vxlan_vnifilter.c -@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, +diff -rupN linux.orig/drivers/net/vxlan/vxlan_vnifilter.c linux/drivers/net/vxlan/vxlan_vnifilter.c +--- linux.orig/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-04 10:40:26.696034096 -0500 +@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(co pstats = per_cpu_ptr(vninode->stats, i); do { @@ -3053,11 +20693,10 @@ index 3e04af4c5daa1..a3de081cda5ee 100644 dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; -diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c -index 6872782e8dd89..22b5939a42bb3 100644 ---- a/drivers/net/wwan/mhi_wwan_mbim.c -+++ b/drivers/net/wwan/mhi_wwan_mbim.c -@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/wwan/mhi_wwan_mbim.c linux/drivers/net/wwan/mhi_wwan_mbim.c +--- linux.orig/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-04 10:40:26.696034096 -0500 +@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(str unsigned int start; do { @@ -3081,11 +20720,10 @@ index 6872782e8dd89..22b5939a42bb3 100644 } static void mhi_mbim_ul_callback(struct mhi_device *mhi_dev, -diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c -index 27a11cc08c61e..df4dc02638a00 100644 ---- a/drivers/net/xen-netfront.c -+++ b/drivers/net/xen-netfront.c -@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/xen-netfront.c linux/drivers/net/xen-netfront.c +--- linux.orig/drivers/net/xen-netfront.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/xen-netfront.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct ne unsigned int start; do { @@ -3106,11 +20744,10 @@ index 27a11cc08c61e..df4dc02638a00 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c -index 2a4b3efb7e12b..9f6ed09538cd0 100644 ---- a/drivers/pinctrl/pinctrl-amd.c -+++ b/drivers/pinctrl/pinctrl-amd.c -@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/pinctrl/pinctrl-amd.c linux/drivers/pinctrl/pinctrl-amd.c +--- linux.orig/drivers/pinctrl/pinctrl-amd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/pinctrl/pinctrl-amd.c 2022-12-04 10:40:26.696034096 -0500 +@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; @@ -3119,11 +20756,10 @@ index 2a4b3efb7e12b..9f6ed09538cd0 100644 /* Clear interrupt. * We must read the pin register again, in case the -diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c -index 617dbf98980ec..97cfbc520a02c 100644 ---- a/drivers/platform/x86/intel/int0002_vgpio.c -+++ b/drivers/platform/x86/intel/int0002_vgpio.c -@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, void *data) +diff -rupN linux.orig/drivers/platform/x86/intel/int0002_vgpio.c linux/drivers/platform/x86/intel/int0002_vgpio.c +--- linux.orig/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-04 10:40:26.696034096 -0500 +@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; @@ -3133,10 +20769,9 @@ index 617dbf98980ec..97cfbc520a02c 100644 pm_wakeup_hard_event(chip->parent); -diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c -index 4b42f2302a8a8..d4f77f6688cf7 100644 ---- a/drivers/spi/spi.c -+++ b/drivers/spi/spi.c +diff -rupN linux.orig/drivers/spi/spi.c linux/drivers/spi/spi.c +--- linux.orig/drivers/spi/spi.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/spi/spi.c 2022-12-04 10:40:26.700034085 -0500 @@ -127,10 +127,10 @@ do { \ unsigned int start; \ pcpu_stats = per_cpu_ptr(in, i); \ @@ -3150,11 +20785,10 @@ index 4b42f2302a8a8..d4f77f6688cf7 100644 &pcpu_stats->syncp, start)); \ ret += inc; \ } \ -diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c -index 2de3896489c84..897cb8db5084f 100644 ---- a/drivers/ssb/driver_gpio.c -+++ b/drivers/ssb/driver_gpio.c -@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/ssb/driver_gpio.c linux/drivers/ssb/driver_gpio.c +--- linux.orig/drivers/ssb/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/ssb/driver_gpio.c 2022-12-04 10:40:26.700034085 -0500 +@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_h return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3164,7 +20798,7 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_chipco_gpio_polarity(chipco, irqs, val & irqs); return IRQ_HANDLED; -@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_handler(int irq, void *dev_id) +@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_ha 
return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3174,11 +20808,207 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_extif_gpio_polarity(extif, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h -index 287153d325365..81f5fce6e895f 100644 ---- a/drivers/tty/serial/8250/8250.h -+++ b/drivers/tty/serial/8250/8250.h -@@ -177,12 +177,74 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c linux/drivers/tty/serial/8250/8250_aspeed_vuart.c +--- linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-04 10:40:26.700034085 -0500 +@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle( + up->ier &= ~irqs; + if (!throttle) + up->ier |= irqs; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) + { +diff -rupN linux.orig/drivers/tty/serial/8250/8250_bcm7271.c linux/drivers/tty/serial/8250/8250_bcm7271.c +--- linux.orig/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-04 10:40:26.700034085 -0500 +@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_ + * will handle this. + */ + up->ier &= ~UART_IER_RDI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + priv->tx_running = false; + priv->dma.rx_dma = NULL; +@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct ua + unsigned int iir = serial_port_in(p, UART_IIR); + struct brcmuart_priv *priv = p->private_data; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; + unsigned int ier; + unsigned int mcr; ++ bool is_console; + int handled = 0; + + /* +@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct ua + spin_lock_irqsave(&p->lock, flags); + status = serial_port_in(p, UART_LSR); + if ((status & UART_LSR_DR) == 0) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); + + ier = serial_port_in(p, UART_IER); + /* +@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct ua + serial_port_in(p, UART_RX); + } + ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); ++ + handled = 1; + } + spin_unlock_irqrestore(&p->lock, flags); +@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrt + struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); + struct uart_port *p = priv->up; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; ++ bool is_console; + + if (priv->shutdown) + return HRTIMER_NORESTART; +@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrt + /* re-enable receive unless upper layer has disabled it */ + if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == + (UART_IER_RLSI | UART_IER_RDI)) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + status = serial_port_in(p, UART_IER); + status |= (UART_IER_RLSI | UART_IER_RDI); + serial_port_out(p, UART_IER, status); + status = serial_port_in(p, UART_MCR); + status |= UART_MCR_RTS; + serial_port_out(p, UART_MCR, status); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + spin_unlock_irqrestore(&p->lock, flags); + 
return HRTIMER_NORESTART; +diff -rupN linux.orig/drivers/tty/serial/8250/8250_core.c linux/drivers/tty/serial/8250/8250_core.c +--- linux.orig/drivers/tty/serial/8250/8250_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_core.c 2022-12-04 10:40:26.700034085 -0500 +@@ -255,8 +255,11 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = from_timer(up, t, timer); ++ struct uart_port *port = &up->port; + unsigned int iir, ier = 0, lsr; ++ unsigned long cs_flags; + unsigned long flags; ++ bool is_console; + + spin_lock_irqsave(&up->port.lock, flags); + +@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(st + * based handler. + */ + if (up->port.irq) { ++ is_console = uart_console(port); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, 0); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + + iir = serial_in(up, UART_IIR); +@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_dr + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -668,6 +687,7 @@ static int univ8250_console_match(struct + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_ + spin_lock_irqsave(&port->lock, flags); + up->ier |= UART_IER_RLSI | UART_IER_RDI; + up->port.read_status_mask |= UART_LSR_DR; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + spin_unlock_irqrestore(&port->lock, flags); + } + +diff -rupN linux.orig/drivers/tty/serial/8250/8250_exar.c linux/drivers/tty/serial/8250/8250_exar.c +--- linux.orig/drivers/tty/serial/8250/8250_exar.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_exar.c 2022-12-04 10:40:26.700034085 -0500 +@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct + + static int xr17v35x_startup(struct uart_port *port) + { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ + /* + * First enable access to IER [7:5], ISR [5:4], FCR [5:4], + * MCR [7:5] and MSR [7:0] +@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_ + * Make sure all interrups are masked until initialization is + * complete and the FIFOs are cleared + */ +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + + return serial8250_do_startup(port); + } +diff -rupN linux.orig/drivers/tty/serial/8250/8250_fsl.c linux/drivers/tty/serial/8250/8250_fsl.c +--- linux.orig/drivers/tty/serial/8250/8250_fsl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_fsl.c 2022-12-04 10:40:26.700034085 -0500 +@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long delay; + 
+- up->ier = port->serial_in(port, UART_IER); ++ up->ier = serial8250_in_IER(up); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff -rupN linux.orig/drivers/tty/serial/8250/8250.h linux/drivers/tty/serial/8250/8250.h +--- linux.orig/drivers/tty/serial/8250/8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250.h 2022-12-04 10:40:26.700034085 -0500 +@@ -177,12 +177,74 @@ static inline void serial_dl_write(struc up->dl_write(up, value); } @@ -3254,7 +21084,7 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) +@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; @@ -3263,213 +21093,10 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c -index 9d2a7856784f7..7cc6b527c088b 100644 ---- a/drivers/tty/serial/8250/8250_aspeed_vuart.c -+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c -@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up, - up->ier &= ~irqs; - if (!throttle) - up->ier |= irqs; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - } - static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) - { -diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c -index 8efdc271eb75f..d30c74618411f 100644 ---- a/drivers/tty/serial/8250/8250_bcm7271.c -+++ b/drivers/tty/serial/8250/8250_bcm7271.c -@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_port *port) - * will handle this. 
- */ - up->ier &= ~UART_IER_RDI; -- serial_port_out(port, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - - priv->tx_running = false; - priv->dma.rx_dma = NULL; -@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct uart_port *p) - unsigned int iir = serial_port_in(p, UART_IIR); - struct brcmuart_priv *priv = p->private_data; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; - unsigned int ier; - unsigned int mcr; -+ bool is_console; - int handled = 0; - - /* -@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct uart_port *p) - spin_lock_irqsave(&p->lock, flags); - status = serial_port_in(p, UART_LSR); - if ((status & UART_LSR_DR) == 0) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); - - ier = serial_port_in(p, UART_IER); - /* -@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p) - serial_port_in(p, UART_RX); - } - -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); -+ - handled = 1; - } - spin_unlock_irqrestore(&p->lock, flags); -@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); - struct uart_port *p = priv->up; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; -+ bool is_console; - - if (priv->shutdown) - return HRTIMER_NORESTART; -@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - /* re-enable receive unless upper layer has disabled it */ - if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == - (UART_IER_RLSI | UART_IER_RDI)) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - status = serial_port_in(p, UART_IER); - status |= (UART_IER_RLSI | UART_IER_RDI); - serial_port_out(p, UART_IER, status); - status = serial_port_in(p, UART_MCR); - status |= UART_MCR_RTS; - serial_port_out(p, UART_MCR, status); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - spin_unlock_irqrestore(&p->lock, flags); - return HRTIMER_NORESTART; -diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c -index 94fbf0add2ce2..196d0c55dfe99 100644 ---- a/drivers/tty/serial/8250/8250_core.c -+++ b/drivers/tty/serial/8250/8250_core.c -@@ -255,8 +255,11 @@ static void serial8250_timeout(struct timer_list *t) - static void serial8250_backup_timeout(struct timer_list *t) - { - struct uart_8250_port *up = from_timer(up, t, timer); -+ struct uart_port *port = &up->port; - unsigned int iir, ier = 0, lsr; -+ unsigned long cs_flags; - unsigned long flags; -+ bool is_console; - - spin_lock_irqsave(&up->port.lock, flags); - -@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(struct timer_list *t) - * based handler. 
- */ - if (up->port.irq) { -+ is_console = uart_console(port); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - - iir = serial_in(up, UART_IIR); -@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(struct timer_list *t) - serial8250_tx_chars(up); - - if (up->port.irq) -- serial_out(up, UART_IER, ier); -+ serial8250_set_IER(up, ier); - - spin_unlock_irqrestore(&up->port.lock, flags); - -@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) - - #ifdef CONFIG_SERIAL_8250_CONSOLE - -+static void univ8250_console_write_atomic(struct console *co, const char *s, -+ unsigned int count) -+{ -+ struct uart_8250_port *up = &serial8250_ports[co->index]; -+ -+ serial8250_console_write_atomic(up, s, count); -+} -+ - static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) - { -@@ -668,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, - - static struct console univ8250_console = { - .name = "ttyS", -+ .write_atomic = univ8250_console_write_atomic, - .write = univ8250_console_write, - .device = uart_console_device, - .setup = univ8250_console_setup, -@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work) - spin_lock_irqsave(&port->lock, flags); - up->ier |= UART_IER_RLSI | UART_IER_RDI; - up->port.read_status_mask |= UART_LSR_DR; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - spin_unlock_irqrestore(&port->lock, flags); - } - -diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c -index 314a05e009df9..9809517de8270 100644 ---- a/drivers/tty/serial/8250/8250_exar.c -+++ b/drivers/tty/serial/8250/8250_exar.c -@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud, - - static int xr17v35x_startup(struct uart_port *port) - { -+ struct uart_8250_port *up = up_to_u8250p(port); -+ - /* - * First enable access to IER [7:5], ISR [5:4], FCR [5:4], - * MCR [7:5] and MSR [7:0] -@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_port *port) - * Make sure all interrups are masked until initialization is - * complete and the FIFOs are cleared - */ -- serial_port_out(port, UART_IER, 0); -+ serial8250_set_IER(up, 0); - - return serial8250_do_startup(port); - } -diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index 8aad15622a2e5..74bb85b705e7f 100644 ---- a/drivers/tty/serial/8250/8250_fsl.c -+++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port) - if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { - unsigned long delay; - -- up->ier = port->serial_in(port, UART_IER); -+ up->ier = serial8250_in_IER(up); -+ - if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { - port->ops->stop_rx(port); - } else { -diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index 2b2f5d8d24b91..2b78e6c394fb9 100644 ---- a/drivers/tty/serial/8250/8250_ingenic.c -+++ b/drivers/tty/serial/8250/8250_ingenic.c -@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", +diff -rupN linux.orig/drivers/tty/serial/8250/8250_ingenic.c linux/drivers/tty/serial/8250/8250_ingenic.c +--- linux.orig/drivers/tty/serial/8250/8250_ingenic.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/drivers/tty/serial/8250/8250_ingenic.c 2022-12-04 10:40:26.700034085 -0500 +@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { @@ -3477,7 +21104,7 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 int ier; switch (offset) { -@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) +@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(stru * If we have enabled modem status IRQs we should enable * modem mode. */ @@ -3486,11 +21113,10 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; -diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index 54051ec7b4992..6092c75808fb9 100644 ---- a/drivers/tty/serial/8250/8250_mtk.c -+++ b/drivers/tty/serial/8250/8250_mtk.c -@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart_port *port) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_mtk.c linux/drivers/tty/serial/8250/8250_mtk.c +--- linux.orig/drivers/tty/serial/8250/8250_mtk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_mtk.c 2022-12-04 10:40:26.700034085 -0500 +@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { @@ -3533,20 +21159,19 @@ index 54051ec7b4992..6092c75808fb9 100644 } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) -diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c -index 38ee3e42251af..8dc983a8cad15 100644 ---- a/drivers/tty/serial/8250/8250_omap.c -+++ b/drivers/tty/serial/8250/8250_omap.c -@@ -325,7 +325,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up) - +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c linux/drivers/tty/serial/8250/8250_omap.c +--- linux.orig/drivers/tty/serial/8250/8250_omap.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c 2022-12-04 10:41:15.271907054 -0500 +@@ -328,7 +328,7 @@ static void omap8250_restore_regs(struct /* drop TCR + TLR access, we setup XON/XOFF later */ - serial8250_out_MCR(up, up->mcr); + serial8250_out_MCR(up, mcr); + - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_dl_write(up, priv->quot); -@@ -515,7 +515,7 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state, +@@ -518,7 +518,7 @@ static void omap_8250_pm(struct uart_por serial_out(up, UART_EFR, efr | UART_EFR_ECB); serial_out(up, UART_LCR, 0); @@ -3555,7 +21180,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, efr); serial_out(up, UART_LCR, 0); -@@ -636,7 +636,7 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) +@@ -639,7 +639,7 @@ static irqreturn_t omap8250_irq(int irq, if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { unsigned long delay; @@ -3564,7 +21189,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { -@@ -696,7 +696,7 @@ static int omap_8250_startup(struct uart_port *port) +@@ -698,7 +698,7 @@ static int omap_8250_startup(struct uart goto err; up->ier = UART_IER_RLSI | UART_IER_RDI; @@ -3573,7 +21198,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 #ifdef CONFIG_PM up->capabilities |= UART_CAP_RPM; -@@ -737,7 +737,7 @@ static void 
omap_8250_shutdown(struct uart_port *port) +@@ -739,7 +739,7 @@ static void omap_8250_shutdown(struct ua serial_out(up, UART_OMAP_EFR2, 0x0); up->ier = 0; @@ -3582,7 +21207,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->dma) serial8250_release_dma(up); -@@ -785,7 +785,7 @@ static void omap_8250_unthrottle(struct uart_port *port) +@@ -787,7 +787,7 @@ static void omap_8250_unthrottle(struct up->dma->rx_dma(up); up->ier |= UART_IER_RLSI | UART_IER_RDI; port->read_status_mask |= UART_LSR_DR; @@ -3591,7 +21216,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 spin_unlock_irqrestore(&port->lock, flags); pm_runtime_mark_last_busy(port->dev); -@@ -876,7 +876,7 @@ static void __dma_rx_complete(void *param) +@@ -878,7 +878,7 @@ static void __dma_rx_complete(void *para __dma_rx_do_complete(p); if (!priv->throttled) { p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3600,7 +21225,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (!(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(p); } -@@ -933,7 +933,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p) +@@ -935,7 +935,7 @@ static int omap_8250_rx_dma(struct uart_ * callback to run. */ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3609,7 +21234,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 } goto out; } -@@ -1148,12 +1148,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, +@@ -1150,12 +1150,12 @@ static void am654_8250_handle_rx_dma(str * periodic timeouts, re-enable interrupts. */ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3624,11 +21249,1731 @@ index 38ee3e42251af..8dc983a8cad15 100644 } } -diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index 2030a92ac66e7..326549603740d 100644 ---- a/drivers/tty/serial/8250/8250_port.c -+++ b/drivers/tty/serial/8250/8250_port.c -@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c.orig linux/drivers/tty/serial/8250/8250_omap.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_omap.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,1716 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * 8250-core based driver for the OMAP internal UART ++ * ++ * based on omap-serial.c, Copyright (C) 2010 Texas Instruments. ++ * ++ * Copyright (C) 2014 Sebastian Andrzej Siewior ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "8250.h" ++ ++#define DEFAULT_CLK_SPEED 48000000 ++ ++#define UART_ERRATA_i202_MDR1_ACCESS (1 << 0) ++#define OMAP_UART_WER_HAS_TX_WAKEUP (1 << 1) ++#define OMAP_DMA_TX_KICK (1 << 2) ++/* ++ * See Advisory 21 in AM437x errata SPRZ408B, updated April 2015. ++ * The same errata is applicable to AM335x and DRA7x processors too. 
++ */ ++#define UART_ERRATA_CLOCK_DISABLE (1 << 3) ++#define UART_HAS_EFR2 BIT(4) ++#define UART_HAS_RHR_IT_DIS BIT(5) ++#define UART_RX_TIMEOUT_QUIRK BIT(6) ++ ++#define OMAP_UART_FCR_RX_TRIG 6 ++#define OMAP_UART_FCR_TX_TRIG 4 ++ ++/* SCR register bitmasks */ ++#define OMAP_UART_SCR_RX_TRIG_GRANU1_MASK (1 << 7) ++#define OMAP_UART_SCR_TX_TRIG_GRANU1_MASK (1 << 6) ++#define OMAP_UART_SCR_TX_EMPTY (1 << 3) ++#define OMAP_UART_SCR_DMAMODE_MASK (3 << 1) ++#define OMAP_UART_SCR_DMAMODE_1 (1 << 1) ++#define OMAP_UART_SCR_DMAMODE_CTL (1 << 0) ++ ++/* MVR register bitmasks */ ++#define OMAP_UART_MVR_SCHEME_SHIFT 30 ++#define OMAP_UART_LEGACY_MVR_MAJ_MASK 0xf0 ++#define OMAP_UART_LEGACY_MVR_MAJ_SHIFT 4 ++#define OMAP_UART_LEGACY_MVR_MIN_MASK 0x0f ++#define OMAP_UART_MVR_MAJ_MASK 0x700 ++#define OMAP_UART_MVR_MAJ_SHIFT 8 ++#define OMAP_UART_MVR_MIN_MASK 0x3f ++ ++/* SYSC register bitmasks */ ++#define OMAP_UART_SYSC_SOFTRESET (1 << 1) ++ ++/* SYSS register bitmasks */ ++#define OMAP_UART_SYSS_RESETDONE (1 << 0) ++ ++#define UART_TI752_TLR_TX 0 ++#define UART_TI752_TLR_RX 4 ++ ++#define TRIGGER_TLR_MASK(x) ((x & 0x3c) >> 2) ++#define TRIGGER_FCR_MASK(x) (x & 3) ++ ++/* Enable XON/XOFF flow control on output */ ++#define OMAP_UART_SW_TX 0x08 ++/* Enable XON/XOFF flow control on input */ ++#define OMAP_UART_SW_RX 0x02 ++ ++#define OMAP_UART_WER_MOD_WKUP 0x7f ++#define OMAP_UART_TX_WAKEUP_EN (1 << 7) ++ ++#define TX_TRIGGER 1 ++#define RX_TRIGGER 48 ++ ++#define OMAP_UART_TCR_RESTORE(x) ((x / 4) << 4) ++#define OMAP_UART_TCR_HALT(x) ((x / 4) << 0) ++ ++#define UART_BUILD_REVISION(x, y) (((x) << 8) | (y)) ++ ++#define OMAP_UART_REV_46 0x0406 ++#define OMAP_UART_REV_52 0x0502 ++#define OMAP_UART_REV_63 0x0603 ++ ++/* Interrupt Enable Register 2 */ ++#define UART_OMAP_IER2 0x1B ++#define UART_OMAP_IER2_RHR_IT_DIS BIT(2) ++ ++/* Enhanced features register 2 */ ++#define UART_OMAP_EFR2 0x23 ++#define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6) ++ ++/* RX FIFO occupancy indicator */ ++#define UART_OMAP_RX_LVL 0x19 ++ ++struct omap8250_priv { ++ int line; ++ u8 habit; ++ u8 mdr1; ++ u8 efr; ++ u8 scr; ++ u8 wer; ++ u8 xon; ++ u8 xoff; ++ u8 delayed_restore; ++ u16 quot; ++ ++ u8 tx_trigger; ++ u8 rx_trigger; ++ bool is_suspending; ++ int wakeirq; ++ int wakeups_enabled; ++ u32 latency; ++ u32 calc_latency; ++ struct pm_qos_request pm_qos_request; ++ struct work_struct qos_work; ++ struct uart_8250_dma omap8250_dma; ++ spinlock_t rx_dma_lock; ++ bool rx_dma_broken; ++ bool throttled; ++}; ++ ++struct omap8250_dma_params { ++ u32 rx_size; ++ u8 rx_trigger; ++ u8 tx_trigger; ++}; ++ ++struct omap8250_platdata { ++ struct omap8250_dma_params *dma_params; ++ u8 habit; ++}; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p); ++#else ++static inline void omap_8250_rx_dma_flush(struct uart_8250_port *p) { } ++#endif ++ ++static u32 uart_read(struct uart_8250_port *up, u32 reg) ++{ ++ return readl(up->port.membase + (reg << up->port.regshift)); ++} ++ ++/* ++ * Called on runtime PM resume path from omap8250_restore_regs(), and ++ * omap8250_set_mctrl(). 
++ */ ++static void __omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u8 lcr; ++ ++ serial8250_do_set_mctrl(port, mctrl); ++ ++ if (!mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS)) { ++ /* ++ * Turn off autoRTS if RTS is lowered and restore autoRTS ++ * setting if RTS is raised ++ */ ++ lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) ++ priv->efr |= UART_EFR_RTS; ++ else ++ priv->efr &= ~UART_EFR_RTS; ++ serial_out(up, UART_EFR, priv->efr); ++ serial_out(up, UART_LCR, lcr); ++ } ++} ++ ++static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ int err; ++ ++ err = pm_runtime_resume_and_get(port->dev); ++ if (err) ++ return; ++ ++ __omap8250_set_mctrl(port, mctrl); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++/* ++ * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460) ++ * The access to uart register after MDR1 Access ++ * causes UART to corrupt data. ++ * ++ * Need a delay = ++ * 5 L4 clock cycles + 5 UART functional clock cycle (@48MHz = ~0.2uS) ++ * give 10 times as much ++ */ ++static void omap_8250_mdr1_errataset(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++ udelay(2); ++ serial_out(up, UART_FCR, up->fcr | UART_FCR_CLEAR_XMIT | ++ UART_FCR_CLEAR_RCVR); ++} ++ ++static void omap_8250_get_divisor(struct uart_port *port, unsigned int baud, ++ struct omap8250_priv *priv) ++{ ++ unsigned int uartclk = port->uartclk; ++ unsigned int div_13, div_16; ++ unsigned int abs_d13, abs_d16; ++ ++ /* ++ * Old custom speed handling. ++ */ ++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) { ++ priv->quot = port->custom_divisor & UART_DIV_MAX; ++ /* ++ * I assume that nobody is using this. But hey, if somebody ++ * would like to specify the divisor _and_ the mode then the ++ * driver is ready and waiting for it. ++ */ ++ if (port->custom_divisor & (1 << 16)) ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ else ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ return; ++ } ++ div_13 = DIV_ROUND_CLOSEST(uartclk, 13 * baud); ++ div_16 = DIV_ROUND_CLOSEST(uartclk, 16 * baud); ++ ++ if (!div_13) ++ div_13 = 1; ++ if (!div_16) ++ div_16 = 1; ++ ++ abs_d13 = abs(baud - uartclk / 13 / div_13); ++ abs_d16 = abs(baud - uartclk / 16 / div_16); ++ ++ if (abs_d13 >= abs_d16) { ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ priv->quot = div_16; ++ } else { ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ priv->quot = div_13; ++ } ++} ++ ++static void omap8250_update_scr(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ u8 old_scr; ++ ++ old_scr = serial_in(up, UART_OMAP_SCR); ++ if (old_scr == priv->scr) ++ return; ++ ++ /* ++ * The manual recommends not to enable the DMA mode selector in the SCR ++ * (instead of the FCR) register _and_ selecting the DMA mode as one ++ * register write because this may lead to malfunction. 
++ */ ++ if (priv->scr & OMAP_UART_SCR_DMAMODE_MASK) ++ serial_out(up, UART_OMAP_SCR, ++ priv->scr & ~OMAP_UART_SCR_DMAMODE_MASK); ++ serial_out(up, UART_OMAP_SCR, priv->scr); ++} ++ ++static void omap8250_update_mdr1(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ if (priv->habit & UART_ERRATA_i202_MDR1_ACCESS) ++ omap_8250_mdr1_errataset(up, priv); ++ else ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++} ++ ++static void omap8250_restore_regs(struct uart_8250_port *up) ++{ ++ struct omap8250_priv *priv = up->port.private_data; ++ struct uart_8250_dma *dma = up->dma; ++ u8 mcr = serial8250_in_MCR(up); ++ ++ if (dma && dma->tx_running) { ++ /* ++ * TCSANOW requests the change to occur immediately however if ++ * we have a TX-DMA operation in progress then it has been ++ * observed that it might stall and never complete. Therefore we ++ * delay DMA completes to prevent this hang from happen. ++ */ ++ priv->delayed_restore = 1; ++ return; ++ } ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial8250_out_MCR(up, mcr | UART_MCR_TCRTLR); ++ serial_out(up, UART_FCR, up->fcr); ++ ++ omap8250_update_scr(up, priv); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_RESTORE(16) | ++ OMAP_UART_TCR_HALT(52)); ++ serial_out(up, UART_TI752_TLR, ++ TRIGGER_TLR_MASK(priv->tx_trigger) << UART_TI752_TLR_TX | ++ TRIGGER_TLR_MASK(priv->rx_trigger) << UART_TI752_TLR_RX); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ /* drop TCR + TLR access, we setup XON/XOFF later */ ++ serial8250_out_MCR(up, mcr); ++ ++ serial_out(up, UART_IER, up->ier); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_dl_write(up, priv->quot); ++ ++ serial_out(up, UART_EFR, priv->efr); ++ ++ /* Configure flow control */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_XON1, priv->xon); ++ serial_out(up, UART_XOFF1, priv->xoff); ++ ++ serial_out(up, UART_LCR, up->lcr); ++ ++ omap8250_update_mdr1(up, priv); ++ ++ __omap8250_set_mctrl(&up->port, up->port.mctrl); ++ ++ if (up->port.rs485.flags & SER_RS485_ENABLED) ++ serial8250_em485_stop_tx(up); ++} ++ ++/* ++ * OMAP can use "CLK / (16 or 13) / div" for baud rate. And then we have have ++ * some differences in how we want to handle flow control. ++ */ ++static void omap_8250_set_termios(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ unsigned char cval = 0; ++ unsigned int baud; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(termios->c_cflag)); ++ ++ if (termios->c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (termios->c_cflag & PARENB) ++ cval |= UART_LCR_PARITY; ++ if (!(termios->c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (termios->c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ */ ++ baud = uart_get_baud_rate(port, termios, old, ++ port->uartclk / 16 / UART_DIV_MAX, ++ port->uartclk / 13); ++ omap_8250_get_divisor(port, baud, priv); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ pm_runtime_get_sync(port->dev); ++ spin_lock_irq(&port->lock); ++ ++ /* ++ * Update the per-port timeout. 
++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | PARMRK)) ++ up->port.read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ up->port.ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ up->port.ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ up->port.ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * Modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ ++ up->lcr = cval; ++ /* Up to here it was mostly serial8250_do_set_termios() */ ++ ++ /* ++ * We enable TRIG_GRANU for RX and TX and additionally we set ++ * SCR_TX_EMPTY bit. The result is the following: ++ * - RX_TRIGGER amount of bytes in the FIFO will cause an interrupt. ++ * - less than RX_TRIGGER number of bytes will also cause an interrupt ++ * once the UART decides that there no new bytes arriving. ++ * - Once THRE is enabled, the interrupt will be fired once the FIFO is ++ * empty - the trigger level is ignored here. ++ * ++ * Once DMA is enabled: ++ * - UART will assert the TX DMA line once there is room for TX_TRIGGER ++ * bytes in the TX FIFO. On each assert the DMA engine will move ++ * TX_TRIGGER bytes into the FIFO. ++ * - UART will assert the RX DMA line once there are RX_TRIGGER bytes in ++ * the FIFO and move RX_TRIGGER bytes. ++ * This is because threshold and trigger values are the same. ++ */ ++ up->fcr = UART_FCR_ENABLE_FIFO; ++ up->fcr |= TRIGGER_FCR_MASK(priv->tx_trigger) << OMAP_UART_FCR_TX_TRIG; ++ up->fcr |= TRIGGER_FCR_MASK(priv->rx_trigger) << OMAP_UART_FCR_RX_TRIG; ++ ++ priv->scr = OMAP_UART_SCR_RX_TRIG_GRANU1_MASK | OMAP_UART_SCR_TX_EMPTY | ++ OMAP_UART_SCR_TX_TRIG_GRANU1_MASK; ++ ++ if (up->dma) ++ priv->scr |= OMAP_UART_SCR_DMAMODE_1 | ++ OMAP_UART_SCR_DMAMODE_CTL; ++ ++ priv->xon = termios->c_cc[VSTART]; ++ priv->xoff = termios->c_cc[VSTOP]; ++ ++ priv->efr = 0; ++ up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); ++ ++ if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS) && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_CTS)) { ++ /* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */ ++ up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; ++ priv->efr |= UART_EFR_CTS; ++ } else if (up->port.flags & UPF_SOFT_FLOW) { ++ /* ++ * OMAP rx s/w flow control is borked; the transmitter remains ++ * stuck off even if rx flow control is subsequently disabled ++ */ ++ ++ /* ++ * IXOFF Flag: ++ * Enable XON/XOFF flow control on output. 
++ * Transmit XON1, XOFF1 ++ */ ++ if (termios->c_iflag & IXOFF) { ++ up->port.status |= UPSTAT_AUTOXOFF; ++ priv->efr |= OMAP_UART_SW_TX; ++ } ++ } ++ omap8250_restore_regs(up); ++ ++ spin_unlock_irq(&up->port.lock); ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ ++ /* calculate wakeup latency constraint */ ++ priv->calc_latency = USEC_PER_SEC * 64 * 8 / baud; ++ priv->latency = priv->calc_latency; ++ ++ schedule_work(&priv->qos_work); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++ ++/* same as 8250 except that we may have extra flow bits set in EFR */ ++static void omap_8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ u8 efr; ++ ++ pm_runtime_get_sync(port->dev); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ efr = serial_in(up, UART_EFR); ++ serial_out(up, UART_EFR, efr | UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, efr); ++ serial_out(up, UART_LCR, 0); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_serial_fill_features_erratas(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ static const struct soc_device_attribute k3_soc_devices[] = { ++ { .family = "AM65X", }, ++ { .family = "J721E", .revision = "SR1.0" }, ++ { /* sentinel */ } ++ }; ++ u32 mvr, scheme; ++ u16 revision, major, minor; ++ ++ mvr = uart_read(up, UART_OMAP_MVER); ++ ++ /* Check revision register scheme */ ++ scheme = mvr >> OMAP_UART_MVR_SCHEME_SHIFT; ++ ++ switch (scheme) { ++ case 0: /* Legacy Scheme: OMAP2/3 */ ++ /* MINOR_REV[0:4], MAJOR_REV[4:7] */ ++ major = (mvr & OMAP_UART_LEGACY_MVR_MAJ_MASK) >> ++ OMAP_UART_LEGACY_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_LEGACY_MVR_MIN_MASK); ++ break; ++ case 1: ++ /* New Scheme: OMAP4+ */ ++ /* MINOR_REV[0:5], MAJOR_REV[8:10] */ ++ major = (mvr & OMAP_UART_MVR_MAJ_MASK) >> ++ OMAP_UART_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_MVR_MIN_MASK); ++ break; ++ default: ++ dev_warn(up->port.dev, ++ "Unknown revision, defaulting to highest\n"); ++ /* highest possible revision */ ++ major = 0xff; ++ minor = 0xff; ++ } ++ /* normalize revision for the driver */ ++ revision = UART_BUILD_REVISION(major, minor); ++ ++ switch (revision) { ++ case OMAP_UART_REV_46: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS; ++ break; ++ case OMAP_UART_REV_52: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ case OMAP_UART_REV_63: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * AM65x SR1.0, AM65x SR2.0 and J721e SR1.0 don't ++ * don't have RHR_IT_DIS bit in IER2 register. So drop to flag ++ * to enable errata workaround. 
++ */ ++ if (soc_device_match(k3_soc_devices)) ++ priv->habit &= ~UART_HAS_RHR_IT_DIS; ++} ++ ++static void omap8250_uart_qos_work(struct work_struct *work) ++{ ++ struct omap8250_priv *priv; ++ ++ priv = container_of(work, struct omap8250_priv, qos_work); ++ cpu_latency_qos_update_request(&priv->pm_qos_request, priv->latency); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_dma_handle_irq(struct uart_port *port); ++#endif ++ ++static irqreturn_t omap8250_irq(int irq, void *dev_id) ++{ ++ struct uart_port *port = dev_id; ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir, lsr; ++ int ret; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++ if (up->dma) { ++ ret = omap_8250_dma_handle_irq(port); ++ return IRQ_RETVAL(ret); ++ } ++#endif ++ ++ serial8250_rpm_get(up); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ /* ++ * On K3 SoCs, it is observed that RX TIMEOUT is signalled after ++ * FIFO has been drained, in which case a dummy read of RX FIFO ++ * is required to clear RX TIMEOUT condition. ++ */ ++ if (priv->habit & UART_RX_TIMEOUT_QUIRK && ++ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && ++ serial_port_in(port, UART_OMAP_RX_LVL) == 0) { ++ serial_port_in(port, UART_RX); ++ } ++ ++ /* Stop processing interrupts on input overrun */ ++ if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { ++ unsigned long delay; ++ ++ up->ier = port->serial_in(port, UART_IER); ++ if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { ++ port->ops->stop_rx(port); ++ } else { ++ /* Keep restarting the timer until ++ * the input overrun subsides. ++ */ ++ cancel_delayed_work(&up->overrun_backoff); ++ } ++ ++ delay = msecs_to_jiffies(up->overrun_backoff_time_ms); ++ schedule_delayed_work(&up->overrun_backoff, delay); ++ } ++ ++ serial8250_rpm_put(up); ++ ++ return IRQ_RETVAL(ret); ++} ++ ++static int omap_8250_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ int ret; ++ ++ if (priv->wakeirq) { ++ ret = dev_pm_set_dedicated_wake_irq(port->dev, priv->wakeirq); ++ if (ret) ++ return ret; ++ } ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* Disable DMA for console UART */ ++ if (uart_console(port)) ++ up->dma = NULL; ++ ++ if (up->dma) { ++ ret = serial8250_request_dma(up); ++ if (ret) { ++ dev_warn_ratelimited(port->dev, ++ "failed to request DMA\n"); ++ up->dma = NULL; ++ } ++ } ++ ++ ret = request_irq(port->irq, omap8250_irq, IRQF_SHARED, ++ dev_name(port->dev), port); ++ if (ret < 0) ++ goto err; ++ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ ++#ifdef CONFIG_PM ++ up->capabilities |= UART_CAP_RPM; ++#endif ++ ++ /* Enable module level wake up */ ++ priv->wer = OMAP_UART_WER_MOD_WKUP; ++ if (priv->habit & OMAP_UART_WER_HAS_TX_WAKEUP) ++ priv->wer |= OMAP_UART_TX_WAKEUP_EN; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ ++ if (up->dma && !(priv->habit & UART_HAS_EFR2)) ++ up->dma->rx_dma(up); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ return 0; ++err: ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ dev_pm_clear_wake_irq(port->dev); ++ return ret; ++} ++ 
++static void omap_8250_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ ++ flush_work(&priv->qos_work); ++ if (up->dma) ++ omap_8250_rx_dma_flush(up); ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_OMAP_WER, 0); ++ if (priv->habit & UART_HAS_EFR2) ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ ++ up->ier = 0; ++ serial_out(up, UART_IER, 0); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ if (up->lcr & UART_LCR_SBC) ++ serial_out(up, UART_LCR, up->lcr & ~UART_LCR_SBC); ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ free_irq(port->irq, port); ++ dev_pm_clear_wake_irq(port->dev); ++} ++ ++static void omap_8250_throttle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ port->ops->stop_rx(port); ++ priv->throttled = true; ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_8250_unthrottle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ priv->throttled = false; ++ if (up->dma) ++ up->dma->rx_dma(up); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ port->read_status_mask |= UART_LSR_DR; ++ serial_out(up, UART_IER, up->ier); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_rx_dma(struct uart_8250_port *p); ++ ++/* Must be called while priv->rx_dma_lock is held */ ++static void __dma_rx_do_complete(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct tty_port *tty_port = &p->port.state->port; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct dma_chan *rxchan = dma->rxchan; ++ dma_cookie_t cookie; ++ struct dma_tx_state state; ++ int count; ++ int ret; ++ u32 reg; ++ ++ if (!dma->rx_running) ++ goto out; ++ ++ cookie = dma->rx_cookie; ++ dma->rx_running = 0; ++ ++ /* Re-enable RX FIFO interrupt now that transfer is complete */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg &= ~UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dmaengine_tx_status(rxchan, cookie, &state); ++ ++ count = dma->rx_size - state.residue + state.in_flight_bytes; ++ if (count < dma->rx_size) { ++ dmaengine_terminate_async(rxchan); ++ ++ /* ++ * Poll for teardown to complete which guarantees in ++ * flight data is drained. 
++ */ ++ if (state.in_flight_bytes) { ++ int poll_count = 25; ++ ++ while (dmaengine_tx_status(rxchan, cookie, NULL) && ++ poll_count--) ++ cpu_relax(); ++ ++ if (poll_count == -1) ++ dev_err(p->port.dev, "teardown incomplete\n"); ++ } ++ } ++ if (!count) ++ goto out; ++ ret = tty_insert_flip_string(tty_port, dma->rx_buf, count); ++ ++ p->port.icount.rx += ret; ++ p->port.icount.buf_overrun += count - ret; ++out: ++ ++ tty_flip_buffer_push(tty_port); ++} ++ ++static void __dma_rx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ /* ++ * If the tx status is not DMA_COMPLETE, then this is a delayed ++ * completion callback. A previous RX timeout flush would have ++ * already pushed the data, so exit. ++ */ ++ if (dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state) != ++ DMA_COMPLETE) { ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ return; ++ } ++ __dma_rx_do_complete(p); ++ if (!priv->throttled) { ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(p, UART_IER, p->ier); ++ if (!(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (!dma->rx_running) { ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return; ++ } ++ ++ ret = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); ++ if (ret == DMA_IN_PROGRESS) { ++ ret = dmaengine_pause(dma->rxchan); ++ if (WARN_ON_ONCE(ret)) ++ priv->rx_dma_broken = true; ++ } ++ __dma_rx_do_complete(p); ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++} ++ ++static int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ int err = 0; ++ struct dma_async_tx_descriptor *desc; ++ unsigned long flags; ++ u32 reg; ++ ++ if (priv->rx_dma_broken) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (dma->rx_running) { ++ enum dma_status state; ++ ++ state = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, NULL); ++ if (state == DMA_COMPLETE) { ++ /* ++ * Disable RX interrupts to allow RX DMA completion ++ * callback to run. ++ */ ++ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(p, UART_IER, p->ier); ++ } ++ goto out; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->rxchan, dma->rx_addr, ++ dma->rx_size, DMA_DEV_TO_MEM, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ dma->rx_running = 1; ++ desc->callback = __dma_rx_complete; ++ desc->callback_param = p; ++ ++ dma->rx_cookie = dmaengine_submit(desc); ++ ++ /* ++ * Disable RX FIFO interrupt while RX DMA is enabled, else ++ * spurious interrupt may be raised when data is in the RX FIFO ++ * but is yet to be drained by DMA. 
++ */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg |= UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dma_async_issue_pending(dma->rxchan); ++out: ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return err; ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p); ++ ++static void omap_8250_dma_tx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct uart_8250_dma *dma = p->dma; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ unsigned long flags; ++ bool en_thri = false; ++ struct omap8250_priv *priv = p->port.private_data; ++ ++ dma_sync_single_for_cpu(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ dma->tx_running = 0; ++ ++ xmit->tail += dma->tx_size; ++ xmit->tail &= UART_XMIT_SIZE - 1; ++ p->port.icount.tx += dma->tx_size; ++ ++ if (priv->delayed_restore) { ++ priv->delayed_restore = 0; ++ omap8250_restore_regs(p); ++ } ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(&p->port); ++ ++ if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) { ++ int ret; ++ ++ ret = omap_8250_tx_dma(p); ++ if (ret) ++ en_thri = true; ++ } else if (p->capabilities & UART_CAP_RPM) { ++ en_thri = true; ++ } ++ ++ if (en_thri) { ++ dma->tx_err = 1; ++ serial8250_set_THRI(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ struct dma_async_tx_descriptor *desc; ++ unsigned int skip_byte = 0; ++ int ret; ++ ++ if (dma->tx_running) ++ return 0; ++ if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { ++ ++ /* ++ * Even if no data, we need to return an error for the two cases ++ * below so serial8250_tx_chars() is invoked and properly clears ++ * THRI and/or runtime suspend. ++ */ ++ if (dma->tx_err || p->capabilities & UART_CAP_RPM) { ++ ret = -EBUSY; ++ goto err; ++ } ++ serial8250_clear_THRI(p); ++ return 0; ++ } ++ ++ dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); ++ if (priv->habit & OMAP_DMA_TX_KICK) { ++ u8 tx_lvl; ++ ++ /* ++ * We need to put the first byte into the FIFO in order to start ++ * the DMA transfer. For transfers smaller than four bytes we ++ * don't bother doing DMA at all. It seem not matter if there ++ * are still bytes in the FIFO from the last transfer (in case ++ * we got here directly from omap_8250_dma_tx_complete()). Bytes ++ * leaving the FIFO seem not to trigger the DMA transfer. It is ++ * really the byte that we put into the FIFO. ++ * If the FIFO is already full then we most likely got here from ++ * omap_8250_dma_tx_complete(). And this means the DMA engine ++ * just completed its work. We don't have to wait the complete ++ * 86us at 115200,8n1 but around 60us (not to mention lower ++ * baudrates). So in that case we take the interrupt and try ++ * again with an empty FIFO. 
++ */ ++ tx_lvl = serial_in(p, UART_OMAP_TX_LVL); ++ if (tx_lvl == p->tx_loadsz) { ++ ret = -EBUSY; ++ goto err; ++ } ++ if (dma->tx_size < 4) { ++ ret = -EINVAL; ++ goto err; ++ } ++ skip_byte = 1; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->txchan, ++ dma->tx_addr + xmit->tail + skip_byte, ++ dma->tx_size - skip_byte, DMA_MEM_TO_DEV, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ dma->tx_running = 1; ++ ++ desc->callback = omap_8250_dma_tx_complete; ++ desc->callback_param = p; ++ ++ dma->tx_cookie = dmaengine_submit(desc); ++ ++ dma_sync_single_for_device(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ dma_async_issue_pending(dma->txchan); ++ if (dma->tx_err) ++ dma->tx_err = 0; ++ ++ serial8250_clear_THRI(p); ++ if (skip_byte) ++ serial_out(p, UART_TX, xmit->buf[xmit->tail]); ++ return 0; ++err: ++ dma->tx_err = 1; ++ return ret; ++} ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ case UART_IIR_RDI: ++ omap_8250_rx_dma_flush(up); ++ return true; ++ } ++ return omap_8250_rx_dma(up); ++} ++ ++static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status) ++{ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (iir & UART_IIR_RDI)) { ++ if (handle_rx_dma(up, iir)) { ++ status = serial8250_rx_chars(up, status); ++ omap_8250_rx_dma(up); ++ } ++ } ++ ++ return status; ++} ++ ++static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, ++ u16 status) ++{ ++ /* ++ * Queue a new transfer if FIFO has data. ++ */ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (up->ier & UART_IER_RDI)) { ++ omap_8250_rx_dma(up); ++ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE); ++ } else if ((iir & 0x3f) == UART_IIR_RX_TIMEOUT) { ++ /* ++ * Disable RX timeout, read IIR to clear ++ * current timeout condition, clear EFR2 to ++ * periodic timeouts, re-enable interrupts. ++ */ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(up, UART_IER, up->ier); ++ omap_8250_rx_dma_flush(up); ++ serial_in(up, UART_IIR); ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ } ++} ++ ++/* ++ * This is mostly serial8250_handle_irq(). We have a slightly different DMA ++ * hoook for RX/TX and need different logic for them in the ISR. Therefore we ++ * use the default routine in the non-DMA case and this one for with DMA. ++ */ ++static int omap_8250_dma_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u16 status; ++ u8 iir; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ if (iir & UART_IIR_NO_INT) { ++ serial8250_rpm_put(up); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock(&port->lock); ++ ++ status = serial_port_in(port, UART_LSR); ++ ++ if (priv->habit & UART_HAS_EFR2) ++ am654_8250_handle_rx_dma(up, iir, status); ++ else ++ status = omap_8250_handle_rx_dma(up, iir, status); ++ ++ serial8250_modem_status(up); ++ if (status & UART_LSR_THRE && up->dma->tx_err) { ++ if (uart_tx_stopped(&up->port) || ++ uart_circ_empty(&up->port.state->xmit)) { ++ up->dma->tx_err = 0; ++ serial8250_tx_chars(up); ++ } else { ++ /* ++ * try again due to an earlier failer which ++ * might have been resolved by now. 
++ */ ++ if (omap_8250_tx_dma(up)) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ uart_unlock_and_check_sysrq(port); ++ ++ serial8250_rpm_put(up); ++ return 1; ++} ++ ++static bool the_no_dma_filter_fn(struct dma_chan *chan, void *param) ++{ ++ return false; ++} ++ ++#else ++ ++static inline int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ return -EINVAL; ++} ++#endif ++ ++static int omap8250_no_handle_irq(struct uart_port *port) ++{ ++ /* IRQ has not been requested but handling irq? */ ++ WARN_ONCE(1, "Unexpected irq handling before port startup\n"); ++ return 0; ++} ++ ++static struct omap8250_dma_params am654_dma = { ++ .rx_size = SZ_2K, ++ .rx_trigger = 1, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_dma_params am33xx_dma = { ++ .rx_size = RX_TRIGGER, ++ .rx_trigger = RX_TRIGGER, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_platdata am654_platdata = { ++ .dma_params = &am654_dma, ++ .habit = UART_HAS_EFR2 | UART_HAS_RHR_IT_DIS | ++ UART_RX_TIMEOUT_QUIRK, ++}; ++ ++static struct omap8250_platdata am33xx_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = OMAP_DMA_TX_KICK | UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static struct omap8250_platdata omap4_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static const struct of_device_id omap8250_dt_ids[] = { ++ { .compatible = "ti,am654-uart", .data = &am654_platdata, }, ++ { .compatible = "ti,omap2-uart" }, ++ { .compatible = "ti,omap3-uart" }, ++ { .compatible = "ti,omap4-uart", .data = &omap4_platdata, }, ++ { .compatible = "ti,am3352-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,am4372-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,dra742-uart", .data = &omap4_platdata, }, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, omap8250_dt_ids); ++ ++static int omap8250_probe(struct platform_device *pdev) ++{ ++ struct device_node *np = pdev->dev.of_node; ++ struct omap8250_priv *priv; ++ const struct omap8250_platdata *pdata; ++ struct uart_8250_port up; ++ struct resource *regs; ++ void __iomem *membase; ++ int irq, ret; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!regs) { ++ dev_err(&pdev->dev, "missing registers\n"); ++ return -EINVAL; ++ } ++ ++ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ membase = devm_ioremap(&pdev->dev, regs->start, ++ resource_size(regs)); ++ if (!membase) ++ return -ENODEV; ++ ++ memset(&up, 0, sizeof(up)); ++ up.port.dev = &pdev->dev; ++ up.port.mapbase = regs->start; ++ up.port.membase = membase; ++ up.port.irq = irq; ++ /* ++ * It claims to be 16C750 compatible however it is a little different. ++ * It has EFR and has no FCR7_64byte bit. The AFE (which it claims to ++ * have) is enabled via EFR instead of MCR. The type is set here 8250 ++ * just to get things going. UNKNOWN does not work for a few reasons and ++ * we don't need our own type since we don't use 8250's set_termios() ++ * or pm callback. ++ */ ++ up.port.type = PORT_8250; ++ up.port.iotype = UPIO_MEM; ++ up.port.flags = UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_SOFT_FLOW | ++ UPF_HARD_FLOW; ++ up.port.private_data = priv; ++ ++ up.port.regshift = 2; ++ up.port.fifosize = 64; ++ up.tx_loadsz = 64; ++ up.capabilities = UART_CAP_FIFO; ++#ifdef CONFIG_PM ++ /* ++ * Runtime PM is mostly transparent. However to do it right we need to a ++ * TX empty interrupt before we can put the device to auto idle. 
So if ++ * PM is not enabled we don't add that flag and can spare that one extra ++ * interrupt in the TX path. ++ */ ++ up.capabilities |= UART_CAP_RPM; ++#endif ++ up.port.set_termios = omap_8250_set_termios; ++ up.port.set_mctrl = omap8250_set_mctrl; ++ up.port.pm = omap_8250_pm; ++ up.port.startup = omap_8250_startup; ++ up.port.shutdown = omap_8250_shutdown; ++ up.port.throttle = omap_8250_throttle; ++ up.port.unthrottle = omap_8250_unthrottle; ++ up.port.rs485_config = serial8250_em485_config; ++ up.port.rs485_supported = serial8250_em485_supported; ++ up.rs485_start_tx = serial8250_em485_start_tx; ++ up.rs485_stop_tx = serial8250_em485_stop_tx; ++ up.port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ ret = of_alias_get_id(np, "serial"); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "failed to get alias\n"); ++ return ret; ++ } ++ up.port.line = ret; ++ ++ if (of_property_read_u32(np, "clock-frequency", &up.port.uartclk)) { ++ struct clk *clk; ++ ++ clk = devm_clk_get(&pdev->dev, NULL); ++ if (IS_ERR(clk)) { ++ if (PTR_ERR(clk) == -EPROBE_DEFER) ++ return -EPROBE_DEFER; ++ } else { ++ up.port.uartclk = clk_get_rate(clk); ++ } ++ } ++ ++ if (of_property_read_u32(np, "overrun-throttle-ms", ++ &up.overrun_backoff_time_ms) != 0) ++ up.overrun_backoff_time_ms = 0; ++ ++ priv->wakeirq = irq_of_parse_and_map(np, 1); ++ ++ pdata = of_device_get_match_data(&pdev->dev); ++ if (pdata) ++ priv->habit |= pdata->habit; ++ ++ if (!up.port.uartclk) { ++ up.port.uartclk = DEFAULT_CLK_SPEED; ++ dev_warn(&pdev->dev, ++ "No clock speed specified: using default: %d\n", ++ DEFAULT_CLK_SPEED); ++ } ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ priv->calc_latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ cpu_latency_qos_add_request(&priv->pm_qos_request, priv->latency); ++ INIT_WORK(&priv->qos_work, omap8250_uart_qos_work); ++ ++ spin_lock_init(&priv->rx_dma_lock); ++ ++ device_init_wakeup(&pdev->dev, true); ++ pm_runtime_enable(&pdev->dev); ++ pm_runtime_use_autosuspend(&pdev->dev); ++ ++ /* ++ * Disable runtime PM until autosuspend delay unless specifically ++ * enabled by the user via sysfs. This is the historic way to ++ * prevent an unsafe default policy with lossy characters on wake-up. ++ * For serdev devices this is not needed, the policy can be managed by ++ * the serdev driver. ++ */ ++ if (!of_get_available_child_count(pdev->dev.of_node)) ++ pm_runtime_set_autosuspend_delay(&pdev->dev, -1); ++ ++ pm_runtime_irq_safe(&pdev->dev); ++ ++ pm_runtime_get_sync(&pdev->dev); ++ ++ omap_serial_fill_features_erratas(&up, priv); ++ up.port.handle_irq = omap8250_no_handle_irq; ++ priv->rx_trigger = RX_TRIGGER; ++ priv->tx_trigger = TX_TRIGGER; ++#ifdef CONFIG_SERIAL_8250_DMA ++ /* ++ * Oh DMA support. If there are no DMA properties in the DT then ++ * we will fall back to a generic DMA channel which does not ++ * really work here. To ensure that we do not get a generic DMA ++ * channel assigned, we have the the_no_dma_filter_fn() here. ++ * To avoid "failed to request DMA" messages we check for DMA ++ * properties in DT. 
++ */ ++ ret = of_property_count_strings(np, "dma-names"); ++ if (ret == 2) { ++ struct omap8250_dma_params *dma_params = NULL; ++ ++ up.dma = &priv->omap8250_dma; ++ up.dma->fn = the_no_dma_filter_fn; ++ up.dma->tx_dma = omap_8250_tx_dma; ++ up.dma->rx_dma = omap_8250_rx_dma; ++ if (pdata) ++ dma_params = pdata->dma_params; ++ ++ if (dma_params) { ++ up.dma->rx_size = dma_params->rx_size; ++ up.dma->rxconf.src_maxburst = dma_params->rx_trigger; ++ up.dma->txconf.dst_maxburst = dma_params->tx_trigger; ++ priv->rx_trigger = dma_params->rx_trigger; ++ priv->tx_trigger = dma_params->tx_trigger; ++ } else { ++ up.dma->rx_size = RX_TRIGGER; ++ up.dma->rxconf.src_maxburst = RX_TRIGGER; ++ up.dma->txconf.dst_maxburst = TX_TRIGGER; ++ } ++ } ++#endif ++ ret = serial8250_register_8250_port(&up); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "unable to register 8250 port\n"); ++ goto err; ++ } ++ priv->line = ret; ++ platform_set_drvdata(pdev, priv); ++ pm_runtime_mark_last_busy(&pdev->dev); ++ pm_runtime_put_autosuspend(&pdev->dev); ++ return 0; ++err: ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ pm_runtime_disable(&pdev->dev); ++ return ret; ++} ++ ++static int omap8250_remove(struct platform_device *pdev) ++{ ++ struct omap8250_priv *priv = platform_get_drvdata(pdev); ++ int err; ++ ++ err = pm_runtime_resume_and_get(&pdev->dev); ++ if (err) ++ return err; ++ ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ flush_work(&priv->qos_work); ++ pm_runtime_disable(&pdev->dev); ++ serial8250_unregister_port(priv->line); ++ cpu_latency_qos_remove_request(&priv->pm_qos_request); ++ device_init_wakeup(&pdev->dev, false); ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int omap8250_prepare(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return 0; ++ priv->is_suspending = true; ++ return 0; ++} ++ ++static void omap8250_complete(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return; ++ priv->is_suspending = false; ++} ++ ++static int omap8250_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ ++ serial8250_suspend_port(priv->line); ++ ++ pm_runtime_get_sync(dev); ++ if (!device_may_wakeup(dev)) ++ priv->wer = 0; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ pm_runtime_mark_last_busy(dev); ++ pm_runtime_put_autosuspend(dev); ++ ++ flush_work(&priv->qos_work); ++ return 0; ++} ++ ++static int omap8250_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ serial8250_resume_port(priv->line); ++ return 0; ++} ++#else ++#define omap8250_prepare NULL ++#define omap8250_complete NULL ++#endif ++ ++#ifdef CONFIG_PM ++static int omap8250_lost_context(struct uart_8250_port *up) ++{ ++ u32 val; ++ ++ val = serial_in(up, UART_OMAP_SCR); ++ /* ++ * If we lose context, then SCR is set to its reset value of zero. ++ * After set_termios() we set bit 3 of SCR (TX_EMPTY_CTL_IT) to 1, ++ * among other bits, to never set the register back to zero again. 
++ */ ++ if (!val) ++ return 1; ++ return 0; ++} ++ ++/* TODO: in future, this should happen via API in drivers/reset/ */ ++static int omap8250_soft_reset(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ int timeout = 100; ++ int sysc; ++ int syss; ++ ++ /* ++ * At least on omap4, unused uarts may not idle after reset without ++ * a basic scr dma configuration even with no dma in use. The ++ * module clkctrl status bits will be 1 instead of 3 blocking idle ++ * for the whole clockdomain. The softreset below will clear scr, ++ * and we restore it on resume so this is safe to do on all SoCs ++ * needing omap8250_soft_reset() quirk. Do it in two writes as ++ * recommended in the comment for omap8250_update_scr(). ++ */ ++ serial_out(up, UART_OMAP_SCR, OMAP_UART_SCR_DMAMODE_1); ++ serial_out(up, UART_OMAP_SCR, ++ OMAP_UART_SCR_DMAMODE_1 | OMAP_UART_SCR_DMAMODE_CTL); ++ ++ sysc = serial_in(up, UART_OMAP_SYSC); ++ ++ /* softreset the UART */ ++ sysc |= OMAP_UART_SYSC_SOFTRESET; ++ serial_out(up, UART_OMAP_SYSC, sysc); ++ ++ /* By experiments, 1us enough for reset complete on AM335x */ ++ do { ++ udelay(1); ++ syss = serial_in(up, UART_OMAP_SYSS); ++ } while (--timeout && !(syss & OMAP_UART_SYSS_RESETDONE)); ++ ++ if (!timeout) { ++ dev_err(dev, "timed out waiting for reset done\n"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++static int omap8250_runtime_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ /* ++ * When using 'no_console_suspend', the console UART must not be ++ * suspended. Since driver suspend is managed by runtime suspend, ++ * preventing runtime suspend (by returning error) will keep device ++ * active during suspend. 
++ */ ++ if (priv->is_suspending && !console_suspend_enabled) { ++ if (uart_console(&up->port)) ++ return -EBUSY; ++ } ++ ++ if (priv->habit & UART_ERRATA_CLOCK_DISABLE) { ++ int ret; ++ ++ ret = omap8250_soft_reset(dev); ++ if (ret) ++ return ret; ++ ++ /* Restore to UART mode after reset (for wakeup) */ ++ omap8250_update_mdr1(up, priv); ++ /* Restore wakeup enable register */ ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ } ++ ++ if (up->dma && up->dma->rxchan) ++ omap_8250_rx_dma_flush(up); ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ schedule_work(&priv->qos_work); ++ ++ return 0; ++} ++ ++static int omap8250_runtime_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ ++ if (omap8250_lost_context(up)) ++ omap8250_restore_regs(up); ++ ++ if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(up); ++ ++ priv->latency = priv->calc_latency; ++ schedule_work(&priv->qos_work); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SERIAL_8250_OMAP_TTYO_FIXUP ++static int __init omap8250_console_fixup(void) ++{ ++ char *omap_str; ++ char *options; ++ u8 idx; ++ ++ if (strstr(boot_command_line, "console=ttyS")) ++ /* user set a ttyS based name for the console */ ++ return 0; ++ ++ omap_str = strstr(boot_command_line, "console=ttyO"); ++ if (!omap_str) ++ /* user did not set ttyO based console, so we don't care */ ++ return 0; ++ ++ omap_str += 12; ++ if ('0' <= *omap_str && *omap_str <= '9') ++ idx = *omap_str - '0'; ++ else ++ return 0; ++ ++ omap_str++; ++ if (omap_str[0] == ',') { ++ omap_str++; ++ options = omap_str; ++ } else { ++ options = NULL; ++ } ++ ++ add_preferred_console("ttyS", idx, options); ++ pr_err("WARNING: Your 'console=ttyO%d' has been replaced by 'ttyS%d'\n", ++ idx, idx); ++ pr_err("This ensures that you still see kernel messages. 
Please\n"); ++ pr_err("update your kernel commandline.\n"); ++ return 0; ++} ++console_initcall(omap8250_console_fixup); ++#endif ++ ++static const struct dev_pm_ops omap8250_dev_pm_ops = { ++ SET_SYSTEM_SLEEP_PM_OPS(omap8250_suspend, omap8250_resume) ++ SET_RUNTIME_PM_OPS(omap8250_runtime_suspend, ++ omap8250_runtime_resume, NULL) ++ .prepare = omap8250_prepare, ++ .complete = omap8250_complete, ++}; ++ ++static struct platform_driver omap8250_platform_driver = { ++ .driver = { ++ .name = "omap8250", ++ .pm = &omap8250_dev_pm_ops, ++ .of_match_table = omap8250_dt_ids, ++ }, ++ .probe = omap8250_probe, ++ .remove = omap8250_remove, ++}; ++module_platform_driver(omap8250_platform_driver); ++ ++MODULE_AUTHOR("Sebastian Andrzej Siewior"); ++MODULE_DESCRIPTION("OMAP 8250 Driver"); ++MODULE_LICENSE("GPL v2"); +Binary files linux.orig/drivers/tty/serial/8250/.8250_omap.c.rej.swp and linux/drivers/tty/serial/8250/.8250_omap.c.rej.swp differ +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c linux/drivers/tty/serial/8250/8250_port.c +--- linux.orig/drivers/tty/serial/8250/8250_port.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c 2022-12-04 10:40:26.700034085 -0500 +@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } @@ -3637,7 +22982,7 @@ index 2030a92ac66e7..326549603740d 100644 if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); -@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_port *up) +@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_p */ static void autoconfig_16550a(struct uart_8250_port *up) { @@ -3649,7 +22994,7 @@ index 2030a92ac66e7..326549603740d 100644 up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; -@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uar return; } @@ -3661,7 +23006,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's -@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uar } serial_out(up, UART_IER, iersave); @@ -3671,7 +23016,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. -@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_ unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; @@ -3682,7 +23027,7 @@ index 2030a92ac66e7..326549603740d 100644 if (!port->iobase && !port->mapbase && !port->membase) return; -@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_ up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { @@ -3694,7 +23039,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. 
-@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_ #endif scratch3 = serial_in(up, UART_IER) & 0x0f; serial_out(up, UART_IER, scratch); @@ -3705,7 +23050,7 @@ index 2030a92ac66e7..326549603740d 100644 if (scratch2 != 0 || scratch3 != 0x0F) { /* * We failed; there's nothing here -@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_ serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); @@ -3717,7 +23062,7 @@ index 2030a92ac66e7..326549603740d 100644 out_unlock: spin_unlock_irqrestore(&port->lock, flags); -@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8 unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; @@ -3727,7 +23072,7 @@ index 2030a92ac66e7..326549603740d 100644 int irq; if (port->flags & UPF_FOURPORT) { -@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8 inb_p(ICP); } @@ -3741,7 +23086,7 @@ index 2030a92ac66e7..326549603740d 100644 /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); -@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8 if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); @@ -3753,7 +23098,7 @@ index 2030a92ac66e7..326549603740d 100644 port->irq = (irq > 0) ? irq : 0; } -@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct uart_port *port) +@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct ua up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; @@ -3762,7 +23107,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) +@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uar serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3771,7 +23116,7 @@ index 2030a92ac66e7..326549603740d 100644 } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); -@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct uart_port *port) +@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; @@ -3780,7 +23125,7 @@ index 2030a92ac66e7..326549603740d 100644 } static void serial8250_enable_ms(struct uart_port *port) -@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct uart_port *port) +@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct up->ier |= UART_IER_MSI; serial8250_rpm_get(up); @@ -3789,7 +23134,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -2144,14 +2171,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2147,14 +2174,7 @@ static void serial8250_put_poll_char(str struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); @@ -3805,7 +23150,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* -@@ -2164,7 +2184,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2167,7 +2187,7 @@ static void serial8250_put_poll_char(str * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); @@ -3814,7 +23159,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } 
-@@ -2173,8 +2193,10 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2176,8 +2196,10 @@ static void serial8250_put_poll_char(str int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3825,7 +23170,7 @@ index 2030a92ac66e7..326549603740d 100644 int retval; u16 lsr; -@@ -2195,7 +2217,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2198,7 +2220,7 @@ int serial8250_do_startup(struct uart_po up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); @@ -3834,7 +23179,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); -@@ -2205,7 +2227,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2208,7 +2230,7 @@ int serial8250_do_startup(struct uart_po if (port->type == PORT_DA830) { /* Reset the port */ @@ -3843,7 +23188,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); mdelay(10); -@@ -2304,6 +2326,8 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2307,6 +2329,8 @@ int serial8250_do_startup(struct uart_po if (retval) goto out; @@ -3852,7 +23197,7 @@ index 2030a92ac66e7..326549603740d 100644 if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; -@@ -2320,6 +2344,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2323,6 +2347,9 @@ int serial8250_do_startup(struct uart_po */ spin_lock_irqsave(&port->lock, flags); @@ -3862,7 +23207,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ -@@ -2330,6 +2357,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2333,6 +2360,9 @@ int serial8250_do_startup(struct uart_po iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); @@ -3872,7 +23217,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); if (port->irqflags & IRQF_SHARED) -@@ -2384,10 +2414,14 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2387,10 +2417,14 @@ int serial8250_do_startup(struct uart_po * Do a quick test to see if we receive an interrupt when we enable * the TX irq. 
*/ @@ -3887,7 +23232,7 @@ index 2030a92ac66e7..326549603740d 100644 if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { -@@ -2419,7 +2453,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2422,7 +2456,7 @@ dont_test_tx_en: if (up->dma) { const char *msg = NULL; @@ -3896,7 +23241,7 @@ index 2030a92ac66e7..326549603740d 100644 msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; -@@ -2470,7 +2504,7 @@ void serial8250_do_shutdown(struct uart_port *port) +@@ -2473,7 +2507,7 @@ void serial8250_do_shutdown(struct uart_ */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; @@ -3905,7 +23250,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); -@@ -2836,7 +2870,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, +@@ -2839,7 +2873,7 @@ serial8250_do_set_termios(struct uart_po if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; @@ -3914,7 +23259,7 @@ index 2030a92ac66e7..326549603740d 100644 if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; -@@ -3301,7 +3335,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); +@@ -3304,7 +3338,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default #ifdef CONFIG_SERIAL_8250_CONSOLE @@ -3923,7 +23268,7 @@ index 2030a92ac66e7..326549603740d 100644 { struct uart_8250_port *up = up_to_u8250p(port); -@@ -3309,6 +3343,18 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) +@@ -3312,6 +3346,18 @@ static void serial8250_console_putchar(s serial_port_out(port, UART_TX, ch); } @@ -3942,7 +23287,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Restore serial console when h/w power-off detected */ -@@ -3335,6 +3381,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) +@@ -3338,6 +3384,32 @@ static void serial8250_console_restore(s serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } @@ -3975,7 +23320,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Print a string to the serial port using the device FIFO * -@@ -3380,24 +3452,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3383,24 +3455,12 @@ void serial8250_console_write(struct uar struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; @@ -3987,13 +23332,13 @@ index 2030a92ac66e7..326549603740d 100644 - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); -+ spin_lock_irqsave(&port->lock, flags); - +- - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); -- ++ spin_lock_irqsave(&port->lock, flags); + - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else @@ -4002,7 +23347,7 @@ index 2030a92ac66e7..326549603740d 100644 /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { -@@ -3431,10 +3491,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3434,10 +3494,12 @@ void serial8250_console_write(struct uar */ !(up->port.flags & UPF_CONS_FLOW); @@ -4015,7 +23360,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Finally, wait for transmitter to become empty -@@ -3447,8 +3509,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3450,8 +3512,7 @@ void serial8250_console_write(struct uar if (em485->tx_stopped) 
up->rs485_stop_tx(up); } @@ -4025,7 +23370,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * The receive handling will happen properly because the -@@ -3460,8 +3521,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3463,8 +3524,7 @@ void serial8250_console_write(struct uar if (up->msr_saved_flags) serial8250_modem_status(up); @@ -4035,7 +23380,7 @@ index 2030a92ac66e7..326549603740d 100644 } static unsigned int probe_baud(struct uart_port *port) -@@ -3481,6 +3541,7 @@ static unsigned int probe_baud(struct uart_port *port) +@@ -3484,6 +3544,7 @@ static unsigned int probe_baud(struct ua int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { @@ -4043,7 +23388,7 @@ index 2030a92ac66e7..326549603740d 100644 int baud = 9600; int bits = 8; int parity = 'n'; -@@ -3490,6 +3551,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) +@@ -3493,6 +3554,8 @@ int serial8250_console_setup(struct uart if (!port->iobase && !port->membase) return -ENODEV; @@ -4052,10 +23397,3534 @@ index 2030a92ac66e7..326549603740d 100644 if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) -diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig -index d0b49e15fbf5e..02c308467339c 100644 ---- a/drivers/tty/serial/8250/Kconfig -+++ b/drivers/tty/serial/8250/Kconfig +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c.orig linux/drivers/tty/serial/8250/8250_port.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_port.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,3521 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Base port operations for 8250/16550-type serial ports ++ * ++ * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. ++ * Split from 8250_core.c, Copyright (C) 2001 Russell King. ++ * ++ * A note about mapbase / membase ++ * ++ * mapbase is the physical address of the IO port. ++ * membase is an 'ioremapped' cookie. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "8250.h" ++ ++/* Nuvoton NPCM timeout register */ ++#define UART_NPCM_TOR 7 ++#define UART_NPCM_TOIE BIT(7) /* Timeout Interrupt Enable */ ++ ++/* ++ * Debugging. ++ */ ++#if 0 ++#define DEBUG_AUTOCONF(fmt...) printk(fmt) ++#else ++#define DEBUG_AUTOCONF(fmt...) do { } while (0) ++#endif ++ ++/* ++ * Here we define the default xmit fifo size used for each type of UART. 
++ */ ++static const struct serial8250_config uart_config[] = { ++ [PORT_UNKNOWN] = { ++ .name = "unknown", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_8250] = { ++ .name = "8250", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16450] = { ++ .name = "16450", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550] = { ++ .name = "16550", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550A] = { ++ .name = "16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_CIRRUS] = { ++ .name = "Cirrus", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16650] = { ++ .name = "ST16650", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16650V2] = { ++ .name = "ST16650V2", ++ .fifo_size = 32, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_00, ++ .rxtrig_bytes = {8, 16, 24, 28}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16750] = { ++ .name = "TI16750", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .rxtrig_bytes = {1, 16, 32, 56}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, ++ }, ++ [PORT_STARTECH] = { ++ .name = "Startech", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16C950] = { ++ .name = "16C950/954", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, ++ .rxtrig_bytes = {16, 32, 112, 120}, ++ /* UART_CAP_EFR breaks billionon CF bluetooth card. */ ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_16654] = { ++ .name = "ST16654", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_10, ++ .rxtrig_bytes = {8, 16, 56, 60}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16850] = { ++ .name = "XR16850", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_RSA] = { ++ .name = "RSA", ++ .fifo_size = 2048, ++ .tx_loadsz = 2048, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NS16550A] = { ++ .name = "NS16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_NATSEMI, ++ }, ++ [PORT_XSCALE] = { ++ .name = "XScale", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, ++ }, ++ [PORT_OCTEON] = { ++ .name = "OCTEON", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_AR7] = { ++ .name = "AR7", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .flags = UART_CAP_FIFO /* | UART_CAP_AFE */, ++ }, ++ [PORT_U6_16550A] = { ++ .name = "U6_16550A", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_TEGRA] = { ++ .name = "Tegra", ++ .fifo_size = 32, ++ .tx_loadsz = 8, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_01, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_RTOIE, ++ }, ++ [PORT_XR17D15X] = { 
++ .name = "XR17D15X", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_XR17V35X] = { ++ .name = "XR17V35X", ++ .fifo_size = 256, ++ .tx_loadsz = 256, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | ++ UART_FCR_T_TRIG_11, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_LPC3220] = { ++ .name = "LPC3220", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_BRCM_TRUMANAGE] = { ++ .name = "TruManage", ++ .fifo_size = 1, ++ .tx_loadsz = 1024, ++ .flags = UART_CAP_HFIFO, ++ }, ++ [PORT_8250_CIR] = { ++ .name = "CIR port" ++ }, ++ [PORT_ALTR_16550_F32] = { ++ .name = "Altera 16550 FIFO32", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 8, 16, 30}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F64] = { ++ .name = "Altera 16550 FIFO64", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 16, 32, 62}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F128] = { ++ .name = "Altera 16550 FIFO128", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 126}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ /* ++ * tx_loadsz is set to 63-bytes instead of 64-bytes to implement ++ * workaround of errata A-008006 which states that tx_loadsz should ++ * be configured less than Maximum supported fifo bytes. ++ */ ++ [PORT_16550A_FSL64] = { ++ .name = "16550A_FSL64", ++ .fifo_size = 64, ++ .tx_loadsz = 63, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, ++ }, ++ [PORT_RT2880] = { ++ .name = "Palmchip BK-3103", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_DA830] = { ++ .name = "TI DA8xx/66AK2x", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_MTK_BTIF] = { ++ .name = "MediaTek BTIF", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NPCM] = { ++ .name = "Nuvoton 16550", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_SUNIX] = { ++ .name = "Sunix", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 112}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_ASPEED_VUART] = { ++ .name = "ASPEED VUART", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++}; ++ ++/* Uart divisor latch read */ ++static int default_serial_dl_read(struct uart_8250_port *up) ++{ ++ /* Assign these in pieces to truncate any bits above 7. 
*/ ++ unsigned char dll = serial_in(up, UART_DLL); ++ unsigned char dlm = serial_in(up, UART_DLM); ++ ++ return dll | dlm << 8; ++} ++ ++/* Uart divisor latch write */ ++static void default_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ serial_out(up, UART_DLL, value & 0xff); ++ serial_out(up, UART_DLM, value >> 8 & 0xff); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ ++#define UART_REG_UNMAPPED -1 ++ ++/* Au1x00/RT288x UART hardware has a weird register layout */ ++static const s8 au_io_in_map[8] = { ++ [UART_RX] = 0, ++ [UART_IER] = 2, ++ [UART_IIR] = 3, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = 7, ++ [UART_MSR] = 8, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++static const s8 au_io_out_map[8] = { ++ [UART_TX] = 1, ++ [UART_IER] = 2, ++ [UART_FCR] = 4, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = UART_REG_UNMAPPED, ++ [UART_MSR] = UART_REG_UNMAPPED, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++unsigned int au_serial_in(struct uart_port *p, int offset) ++{ ++ if (offset >= ARRAY_SIZE(au_io_in_map)) ++ return UINT_MAX; ++ offset = au_io_in_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return UINT_MAX; ++ return __raw_readl(p->membase + (offset << p->regshift)); ++} ++ ++void au_serial_out(struct uart_port *p, int offset, int value) ++{ ++ if (offset >= ARRAY_SIZE(au_io_out_map)) ++ return; ++ offset = au_io_out_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return; ++ __raw_writel(value, p->membase + (offset << p->regshift)); ++} ++ ++/* Au1x00 haven't got a standard divisor latch */ ++static int au_serial_dl_read(struct uart_8250_port *up) ++{ ++ return __raw_readl(up->port.membase + 0x28); ++} ++ ++static void au_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ __raw_writel(value, up->port.membase + 0x28); ++} ++ ++#endif ++ ++static unsigned int hub6_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ return inb(p->iobase + 1); ++} ++ ++static void hub6_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ outb(value, p->iobase + 1); ++} ++ ++static unsigned int mem_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readb(p->membase + offset); ++} ++ ++static void mem_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writeb(value, p->membase + offset); ++} ++ ++static void mem16_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writew(value, p->membase + offset); ++} ++ ++static unsigned int mem16_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readw(p->membase + offset); ++} ++ ++static void mem32_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writel(value, p->membase + offset); ++} ++ ++static unsigned int mem32_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readl(p->membase + offset); ++} ++ ++static void mem32be_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ iowrite32be(value, p->membase + offset); ++} ++ ++static unsigned int mem32be_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return ioread32be(p->membase + offset); ++} ++ ++static unsigned int io_serial_in(struct uart_port *p, int offset) ++{ ++ offset = 
offset << p->regshift; ++ return inb(p->iobase + offset); ++} ++ ++static void io_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(value, p->iobase + offset); ++} ++ ++static int serial8250_default_handle_irq(struct uart_port *port); ++ ++static void set_io_from_upio(struct uart_port *p) ++{ ++ struct uart_8250_port *up = up_to_u8250p(p); ++ ++ up->dl_read = default_serial_dl_read; ++ up->dl_write = default_serial_dl_write; ++ ++ switch (p->iotype) { ++ case UPIO_HUB6: ++ p->serial_in = hub6_serial_in; ++ p->serial_out = hub6_serial_out; ++ break; ++ ++ case UPIO_MEM: ++ p->serial_in = mem_serial_in; ++ p->serial_out = mem_serial_out; ++ break; ++ ++ case UPIO_MEM16: ++ p->serial_in = mem16_serial_in; ++ p->serial_out = mem16_serial_out; ++ break; ++ ++ case UPIO_MEM32: ++ p->serial_in = mem32_serial_in; ++ p->serial_out = mem32_serial_out; ++ break; ++ ++ case UPIO_MEM32BE: ++ p->serial_in = mem32be_serial_in; ++ p->serial_out = mem32be_serial_out; ++ break; ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ case UPIO_AU: ++ p->serial_in = au_serial_in; ++ p->serial_out = au_serial_out; ++ up->dl_read = au_serial_dl_read; ++ up->dl_write = au_serial_dl_write; ++ break; ++#endif ++ ++ default: ++ p->serial_in = io_serial_in; ++ p->serial_out = io_serial_out; ++ break; ++ } ++ /* Remember loaded iotype */ ++ up->cur_iotype = p->iotype; ++ p->handle_irq = serial8250_default_handle_irq; ++} ++ ++static void ++serial_port_out_sync(struct uart_port *p, int offset, int value) ++{ ++ switch (p->iotype) { ++ case UPIO_MEM: ++ case UPIO_MEM16: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_AU: ++ p->serial_out(p, offset, value); ++ p->serial_in(p, UART_LCR); /* safe, no side-effects */ ++ break; ++ default: ++ p->serial_out(p, offset, value); ++ } ++} ++ ++/* ++ * FIFO support. ++ */ ++static void serial8250_clear_fifos(struct uart_8250_port *p) ++{ ++ if (p->capabilities & UART_CAP_FIFO) { ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial_out(p, UART_FCR, 0); ++ } ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); ++ ++void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) ++{ ++ serial8250_clear_fifos(p); ++ serial_out(p, UART_FCR, p->fcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); ++ ++void serial8250_rpm_get(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get); ++ ++void serial8250_rpm_put(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put); ++ ++/** ++ * serial8250_em485_init() - put uart_8250_port into rs485 emulating ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to start rs485 software emulating on the ++ * &struct uart_8250_port* @p. Namely, RTS is switched before/after ++ * transmission. The function is idempotent, so it is safe to call it ++ * multiple times. ++ * ++ * The caller MUST enable interrupt on empty shift register before ++ * calling serial8250_em485_init(). This interrupt is not a part of ++ * 8250 standard, but implementation defined. 
++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_destroy() ++ * ++ * Return 0 - success, -errno - otherwise ++ */ ++static int serial8250_em485_init(struct uart_8250_port *p) ++{ ++ if (p->em485) ++ goto deassert_rts; ++ ++ p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); ++ if (!p->em485) ++ return -ENOMEM; ++ ++ hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; ++ p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; ++ p->em485->port = p; ++ p->em485->active_timer = NULL; ++ p->em485->tx_stopped = true; ++ ++deassert_rts: ++ if (p->em485->tx_stopped) ++ p->rs485_stop_tx(p); ++ ++ return 0; ++} ++ ++/** ++ * serial8250_em485_destroy() - put uart_8250_port into normal state ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to stop rs485 software emulating on the ++ * &struct uart_8250_port* @p. The function is idempotent, so it is safe to ++ * call it multiple times. ++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_init() ++ */ ++void serial8250_em485_destroy(struct uart_8250_port *p) ++{ ++ if (!p->em485) ++ return; ++ ++ hrtimer_cancel(&p->em485->start_tx_timer); ++ hrtimer_cancel(&p->em485->stop_tx_timer); ++ ++ kfree(p->em485); ++ p->em485 = NULL; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_destroy); ++ ++struct serial_rs485 serial8250_em485_supported = { ++ .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | ++ SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, ++ .delay_rts_before_send = 1, ++ .delay_rts_after_send = 1, ++}; ++EXPORT_SYMBOL_GPL(serial8250_em485_supported); ++ ++/** ++ * serial8250_em485_config() - generic ->rs485_config() callback ++ * @port: uart port ++ * @rs485: rs485 settings ++ * ++ * Generic callback usable by 8250 uart drivers to activate rs485 settings ++ * if the uart is incapable of driving RTS as a Transmit Enable signal in ++ * hardware, relying on software emulation instead. ++ */ ++int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, ++ struct serial_rs485 *rs485) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* pick sane settings if the user hasn't */ ++ if (!!(rs485->flags & SER_RS485_RTS_ON_SEND) == ++ !!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { ++ rs485->flags |= SER_RS485_RTS_ON_SEND; ++ rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; ++ } ++ ++ /* ++ * Both serial8250_em485_init() and serial8250_em485_destroy() ++ * are idempotent. ++ */ ++ if (rs485->flags & SER_RS485_ENABLED) ++ return serial8250_em485_init(up); ++ ++ serial8250_em485_destroy(up); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_config); ++ ++/* ++ * These two wrappers ensure that enable_runtime_pm_tx() can be called more than ++ * once and disable_runtime_pm_tx() will still disable RPM because the fifo is ++ * empty and the HW can idle again. 
++ */ ++void serial8250_rpm_get_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 1); ++ if (rpm_active) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); ++ ++void serial8250_rpm_put_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 0); ++ if (!rpm_active) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); ++ ++/* ++ * IER sleep support. UARTs which have EFRs need the "extended ++ * capability" bit enabled. Note that on XR16C850s, we need to ++ * reset LCR to write to IER. ++ */ ++static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) ++{ ++ unsigned char lcr = 0, efr = 0; ++ ++ serial8250_rpm_get(p); ++ ++ if (p->capabilities & UART_CAP_SLEEP) { ++ if (p->capabilities & UART_CAP_EFR) { ++ lcr = serial_in(p, UART_LCR); ++ efr = serial_in(p, UART_EFR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, UART_EFR_ECB); ++ serial_out(p, UART_LCR, 0); ++ } ++ serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ if (p->capabilities & UART_CAP_EFR) { ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, efr); ++ serial_out(p, UART_LCR, lcr); ++ } ++ } ++ ++ serial8250_rpm_put(p); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++/* ++ * Attempts to turn on the RSA FIFO. Returns zero on failure. ++ * We set the port uart clock rate if we succeed. ++ */ ++static int __enable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; ++ ++ return result; ++} ++ ++static void enable_rsa(struct uart_8250_port *up) ++{ ++ if (up->port.type == PORT_RSA) { ++ if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ __enable_rsa(up); ++ spin_unlock_irq(&up->port.lock); ++ } ++ if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) ++ serial_out(up, UART_RSA_FRR, 0); ++ } ++} ++ ++/* ++ * Attempts to turn off the RSA FIFO. Returns zero on failure. ++ * It is unknown why interrupts were disabled in here. However, ++ * the caller is expected to preserve this behaviour by grabbing ++ * the spinlock before calling this function. ++ */ ++static void disable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ if (up->port.type == PORT_RSA && ++ up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; ++ spin_unlock_irq(&up->port.lock); ++ } ++} ++#endif /* CONFIG_SERIAL_8250_RSA */ ++ ++/* ++ * This is a quickie test to see how big the FIFO is. ++ * It doesn't work at all the time, more's the pity. 
++ */ ++static int size_fifo(struct uart_8250_port *up) ++{ ++ unsigned char old_fcr, old_mcr, old_lcr; ++ unsigned short old_dl; ++ int count; ++ ++ old_lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, 0); ++ old_fcr = serial_in(up, UART_FCR); ++ old_mcr = serial8250_in_MCR(up); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial8250_out_MCR(up, UART_MCR_LOOP); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(up); ++ serial_dl_write(up, 0x0001); ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ for (count = 0; count < 256; count++) ++ serial_out(up, UART_TX, count); ++ mdelay(20);/* FIXME - schedule_timeout */ ++ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && ++ (count < 256); count++) ++ serial_in(up, UART_RX); ++ serial_out(up, UART_FCR, old_fcr); ++ serial8250_out_MCR(up, old_mcr); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_dl_write(up, old_dl); ++ serial_out(up, UART_LCR, old_lcr); ++ ++ return count; ++} ++ ++/* ++ * Read UART ID using the divisor method - set DLL and DLM to zero ++ * and the revision will be in DLL and device type in DLM. We ++ * preserve the device state across this. ++ */ ++static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) ++{ ++ unsigned char old_lcr; ++ unsigned int id, old_dl; ++ ++ old_lcr = serial_in(p, UART_LCR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(p); ++ serial_dl_write(p, 0); ++ id = serial_dl_read(p); ++ serial_dl_write(p, old_dl); ++ ++ serial_out(p, UART_LCR, old_lcr); ++ ++ return id; ++} ++ ++/* ++ * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. ++ * When this function is called we know it is at least a StarTech ++ * 16650 V2, but it might be one of several StarTech UARTs, or one of ++ * its clones. (We treat the broken original StarTech 16650 V1 as a ++ * 16550, and why not? Startech doesn't seem to even acknowledge its ++ * existence.) ++ * ++ * What evil have men's minds wrought... ++ */ ++static void autoconfig_has_efr(struct uart_8250_port *up) ++{ ++ unsigned int id1, id2, id3, rev; ++ ++ /* ++ * Everything with an EFR has SLEEP ++ */ ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ ++ /* ++ * First we check to see if it's an Oxford Semiconductor UART. ++ * ++ * If we have to do this here because some non-National ++ * Semiconductor clone chips lock up if you try writing to the ++ * LSR register (which serial_icr_read does) ++ */ ++ ++ /* ++ * Check for Oxford Semiconductor 16C950. ++ * ++ * EFR [4] must be set else this test fails. ++ * ++ * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) ++ * claims that it's needed for 952 dual UART's (which are not ++ * recommended for new designs). ++ */ ++ up->acr = 0; ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0x00); ++ id1 = serial_icr_read(up, UART_ID1); ++ id2 = serial_icr_read(up, UART_ID2); ++ id3 = serial_icr_read(up, UART_ID3); ++ rev = serial_icr_read(up, UART_REV); ++ ++ DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); ++ ++ if (id1 == 0x16 && id2 == 0xC9 && ++ (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { ++ up->port.type = PORT_16C950; ++ ++ /* ++ * Enable work around for the Oxford Semiconductor 952 rev B ++ * chip which causes it to seriously miscalculate baud rates ++ * when DLL is 0. 
++ */ ++ if (id3 == 0x52 && rev == 0x01) ++ up->bugs |= UART_BUG_QUOT; ++ return; ++ } ++ ++ /* ++ * We check for a XR16C850 by setting DLL and DLM to 0, and then ++ * reading back DLL and DLM. The chip type depends on the DLM ++ * value read back: ++ * 0x10 - XR16C850 and the DLL contains the chip revision. ++ * 0x12 - XR16C2850. ++ * 0x14 - XR16C854. ++ */ ++ id1 = autoconfig_read_divisor_id(up); ++ DEBUG_AUTOCONF("850id=%04x ", id1); ++ ++ id2 = id1 >> 8; ++ if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { ++ up->port.type = PORT_16850; ++ return; ++ } ++ ++ /* ++ * It wasn't an XR16C850. ++ * ++ * We distinguish between the '654 and the '650 by counting ++ * how many bytes are in the FIFO. I'm using this for now, ++ * since that's the technique that was sent to me in the ++ * serial driver update, but I'm not convinced this works. ++ * I've had problems doing this in the past. -TYT ++ */ ++ if (size_fifo(up) == 64) ++ up->port.type = PORT_16654; ++ else ++ up->port.type = PORT_16650V2; ++} ++ ++/* ++ * We detected a chip without a FIFO. Only two fall into ++ * this category - the original 8250 and the 16450. The ++ * 16450 has a scratch register (accessible with LCR=0) ++ */ ++static void autoconfig_8250(struct uart_8250_port *up) ++{ ++ unsigned char scratch, status1, status2; ++ ++ up->port.type = PORT_8250; ++ ++ scratch = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0xa5); ++ status1 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0x5a); ++ status2 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, scratch); ++ ++ if (status1 == 0xa5 && status2 == 0x5a) ++ up->port.type = PORT_16450; ++} ++ ++static int broken_efr(struct uart_8250_port *up) ++{ ++ /* ++ * Exar ST16C2550 "A2" devices incorrectly detect as ++ * having an EFR, and report an ID of 0x0201. See ++ * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html ++ */ ++ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * We know that the chip has FIFOs. Does it have an EFR? The ++ * EFR is located in the same register position as the IIR and ++ * we know the top two bits of the IIR are currently set. The ++ * EFR should contain zero. Try to read the EFR. ++ */ ++static void autoconfig_16550a(struct uart_8250_port *up) ++{ ++ unsigned char status1, status2; ++ unsigned int iersave; ++ ++ up->port.type = PORT_16550A; ++ up->capabilities |= UART_CAP_FIFO; ++ ++ if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && ++ !(up->port.flags & UPF_FULL_PROBE)) ++ return; ++ ++ /* ++ * Check for presence of the EFR when DLAB is set. ++ * Only ST16C650V1 UARTs pass this test. ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ if (serial_in(up, UART_EFR) == 0) { ++ serial_out(up, UART_EFR, 0xA8); ++ if (serial_in(up, UART_EFR) != 0) { ++ DEBUG_AUTOCONF("EFRv1 "); ++ up->port.type = PORT_16650; ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ } else { ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ if (status1 == 7) ++ up->port.type = PORT_16550A_FSL64; ++ else ++ DEBUG_AUTOCONF("Motorola 8xxx DUART "); ++ } ++ serial_out(up, UART_EFR, 0); ++ return; ++ } ++ ++ /* ++ * Maybe it requires 0xbf to be written to the LCR. 
++ * (other ST16C650V2 UARTs, TI16C752A, etc) ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { ++ DEBUG_AUTOCONF("EFRv2 "); ++ autoconfig_has_efr(up); ++ return; ++ } ++ ++ /* ++ * Check for a National Semiconductor SuperIO chip. ++ * Attempt to switch to bank 2, read the value of the LOOP bit ++ * from EXCR1. Switch back to bank 0, change it in MCR. Then ++ * switch back to bank 2, read it from EXCR1 again and check ++ * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 ++ */ ++ serial_out(up, UART_LCR, 0); ++ status1 = serial8250_in_MCR(up); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ ++ if (!((status2 ^ status1) & UART_MCR_LOOP)) { ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1); ++ ++ if ((status2 ^ status1) & UART_MCR_LOOP) { ++ unsigned short quot; ++ ++ serial_out(up, UART_LCR, 0xE0); ++ ++ quot = serial_dl_read(up); ++ quot <<= 3; ++ ++ if (ns16550a_goto_highspeed(up)) ++ serial_dl_write(up, quot); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ up->port.uartclk = 921600*16; ++ up->port.type = PORT_NS16550A; ++ up->capabilities |= UART_NATSEMI; ++ return; ++ } ++ } ++ ++ /* ++ * No EFR. Try to detect a TI16750, which only sets bit 5 of ++ * the IIR when 64 byte FIFO mode is enabled when DLAB is set. ++ * Try setting it with and without DLAB set. Cheap clones ++ * set bit 5 without DLAB set. ++ */ ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status2 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, 0); ++ ++ DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); ++ ++ if (status1 == 6 && status2 == 7) { ++ up->port.type = PORT_16750; ++ up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; ++ return; ++ } ++ ++ /* ++ * Try writing and reading the UART_IER_UUE bit (b6). ++ * If it works, this is probably one of the Xscale platform's ++ * internal UARTs. ++ * We're going to explicitly set the UUE bit to 0 before ++ * trying to write and read a 1 just to make sure it's not ++ * already a 1 and maybe locked there before we even start start. ++ */ ++ iersave = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, iersave & ~UART_IER_UUE); ++ if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { ++ /* ++ * OK it's in a known zero state, try writing and reading ++ * without disturbing the current state of the other bits. ++ */ ++ serial_out(up, UART_IER, iersave | UART_IER_UUE); ++ if (serial_in(up, UART_IER) & UART_IER_UUE) { ++ /* ++ * It's an Xscale. ++ * We'll leave the UART_IER_UUE bit set to 1 (enabled). ++ */ ++ DEBUG_AUTOCONF("Xscale "); ++ up->port.type = PORT_XSCALE; ++ up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; ++ return; ++ } ++ } else { ++ /* ++ * If we got here we couldn't force the IER_UUE bit to 0. ++ * Log it and continue. ++ */ ++ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); ++ } ++ serial_out(up, UART_IER, iersave); ++ ++ /* ++ * We distinguish between 16550A and U6 16550A by counting ++ * how many bytes are in the FIFO. 
++ */ ++ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { ++ up->port.type = PORT_U6_16550A; ++ up->capabilities |= UART_CAP_AFE; ++ } ++} ++ ++/* ++ * This routine is called by rs_init() to initialize a specific serial ++ * port. It determines what type of UART chip this serial port is ++ * using: 8250, 16450, 16550, 16550A. The important question is ++ * whether or not this UART is a 16550A or not, since this will ++ * determine whether or not we can use its FIFO features or not. ++ */ ++static void autoconfig(struct uart_8250_port *up) ++{ ++ unsigned char status1, scratch, scratch2, scratch3; ++ unsigned char save_lcr, save_mcr; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int old_capabilities; ++ ++ if (!port->iobase && !port->mapbase && !port->membase) ++ return; ++ ++ DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", ++ port->name, port->iobase, port->membase); ++ ++ /* ++ * We really do need global IRQs disabled here - we're going to ++ * be frobbing the chips IRQ enable register to see if it exists. ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->capabilities = 0; ++ up->bugs = 0; ++ ++ if (!(port->flags & UPF_BUGGY_UART)) { ++ /* ++ * Do a simple existence test first; if we fail this, ++ * there's no point trying anything else. ++ * ++ * 0x80 is used as a nonsense port to prevent against ++ * false positives due to ISA bus float. The ++ * assumption is that 0x80 is a non-existent port; ++ * which should be safe since include/asm/io.h also ++ * makes this assumption. ++ * ++ * Note: this is safe as long as MCR bit 4 is clear ++ * and the device is in "PC" mode. ++ */ ++ scratch = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, 0); ++#ifdef __i386__ ++ outb(0xff, 0x080); ++#endif ++ /* ++ * Mask out IER[7:4] bits for test as some UARTs (e.g. TL ++ * 16C754B) allow only to modify them if an EFR bit is set. ++ */ ++ scratch2 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, 0x0F); ++#ifdef __i386__ ++ outb(0, 0x080); ++#endif ++ scratch3 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, scratch); ++ if (scratch2 != 0 || scratch3 != 0x0F) { ++ /* ++ * We failed; there's nothing here ++ */ ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", ++ scratch2, scratch3); ++ goto out; ++ } ++ } ++ ++ save_mcr = serial8250_in_MCR(up); ++ save_lcr = serial_in(up, UART_LCR); ++ ++ /* ++ * Check to see if a UART is really there. Certain broken ++ * internal modems based on the Rockwell chipset fail this ++ * test, because they apparently don't implement the loopback ++ * test mode. So this test is skipped on the COM 1 through ++ * COM 4 ports. This *should* be safe, since no board ++ * manufacturer would be stupid enough to design a board ++ * that conflicts with COM 1-4 --- we hope! ++ */ ++ if (!(port->flags & UPF_SKIP_TEST)) { ++ serial8250_out_MCR(up, UART_MCR_LOOP | 0x0A); ++ status1 = serial_in(up, UART_MSR) & 0xF0; ++ serial8250_out_MCR(up, save_mcr); ++ if (status1 != 0x90) { ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("LOOP test failed (%02x) ", ++ status1); ++ goto out; ++ } ++ } ++ ++ /* ++ * We're pretty sure there's a port here. Lets find out what ++ * type of port it is. The IIR top two bits allows us to find ++ * out if it's 8250 or 16450, 16550, 16550A or later. This ++ * determines what we test for next. ++ * ++ * We also initialise the EFR (if any) to zero for later. The ++ * EFR occupies the same register location as the FCR and IIR. 
++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ ++ /* Assign this as it is to truncate any bits above 7. */ ++ scratch = serial_in(up, UART_IIR); ++ ++ switch (scratch >> 6) { ++ case 0: ++ autoconfig_8250(up); ++ break; ++ case 1: ++ port->type = PORT_UNKNOWN; ++ break; ++ case 2: ++ port->type = PORT_16550; ++ break; ++ case 3: ++ autoconfig_16550a(up); ++ break; ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Only probe for RSA ports if we got the region. ++ */ ++ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && ++ __enable_rsa(up)) ++ port->type = PORT_RSA; ++#endif ++ ++ serial_out(up, UART_LCR, save_lcr); ++ ++ port->fifosize = uart_config[up->port.type].fifo_size; ++ old_capabilities = up->capabilities; ++ up->capabilities = uart_config[port->type].flags; ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ ++ if (port->type == PORT_UNKNOWN) ++ goto out_unlock; ++ ++ /* ++ * Reset the UART. ++ */ ++#ifdef CONFIG_SERIAL_8250_RSA ++ if (port->type == PORT_RSA) ++ serial_out(up, UART_RSA_FRR, 0); ++#endif ++ serial8250_out_MCR(up, save_mcr); ++ serial8250_clear_fifos(up); ++ serial_in(up, UART_RX); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_out(up, UART_IER, UART_IER_UUE); ++ else ++ serial_out(up, UART_IER, 0); ++ ++out_unlock: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Check if the device is a Fintek F81216A ++ */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) ++ fintek_8250_probe(up); ++ ++ if (up->capabilities != old_capabilities) { ++ dev_warn(port->dev, "detected caps %08x should be %08x\n", ++ old_capabilities, up->capabilities); ++ } ++out: ++ DEBUG_AUTOCONF("iir=%d ", scratch); ++ DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); ++} ++ ++static void autoconfig_irq(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char save_mcr, save_ier; ++ unsigned char save_ICP = 0; ++ unsigned int ICP = 0; ++ unsigned long irqs; ++ int irq; ++ ++ if (port->flags & UPF_FOURPORT) { ++ ICP = (port->iobase & 0xfe0) | 0x1f; ++ save_ICP = inb_p(ICP); ++ outb_p(0x80, ICP); ++ inb_p(ICP); ++ } ++ ++ if (uart_console(port)) ++ console_lock(); ++ ++ /* forget possible initially masked and pending IRQ */ ++ probe_irq_off(probe_irq_on()); ++ save_mcr = serial8250_in_MCR(up); ++ save_ier = serial_in(up, UART_IER); ++ serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); ++ ++ irqs = probe_irq_on(); ++ serial8250_out_MCR(up, 0); ++ udelay(10); ++ if (port->flags & UPF_FOURPORT) { ++ serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); ++ } else { ++ serial8250_out_MCR(up, ++ UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); ++ } ++ serial_out(up, UART_IER, 0x0f); /* enable all intrs */ ++ serial_in(up, UART_LSR); ++ serial_in(up, UART_RX); ++ serial_in(up, UART_IIR); ++ serial_in(up, UART_MSR); ++ serial_out(up, UART_TX, 0xFF); ++ udelay(20); ++ irq = probe_irq_off(irqs); ++ ++ serial8250_out_MCR(up, save_mcr); ++ serial_out(up, UART_IER, save_ier); ++ ++ if (port->flags & UPF_FOURPORT) ++ outb_p(save_ICP, ICP); ++ ++ if (uart_console(port)) ++ console_unlock(); ++ ++ port->irq = (irq > 0) ? 
irq : 0; ++} ++ ++static void serial8250_stop_rx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ up->port.read_status_mask &= ~UART_LSR_DR; ++ serial_port_out(port, UART_IER, up->ier); ++ ++ serial8250_rpm_put(up); ++} ++ ++/** ++ * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback ++ * @p: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to stop rs485 transmission. ++ */ ++void serial8250_em485_stop_tx(struct uart_8250_port *p) ++{ ++ unsigned char mcr = serial8250_in_MCR(p); ++ ++ if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(p, mcr); ++ ++ /* ++ * Empty the RX FIFO, we are not interested in anything ++ * received during the half-duplex transmission. ++ * Enable previously disabled RX interrupts. ++ */ ++ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { ++ serial8250_clear_and_reinit_fifos(p); ++ ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_port_out(&p->port, UART_IER, p->ier); ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); ++ ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ stop_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ serial8250_rpm_get(p); ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->stop_tx_timer) { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ serial8250_rpm_put(p); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) ++{ ++ hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); ++} ++ ++static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; ++ ++ /* ++ * rs485_stop_tx() is going to set RTS according to config ++ * AND flush RX FIFO if required. ++ */ ++ if (stop_delay > 0) { ++ em485->active_timer = &em485->stop_tx_timer; ++ hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); ++ } else { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++} ++ ++static inline void __stop_tx(struct uart_8250_port *p) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ if (em485) { ++ u16 lsr = serial_lsr_in(p); ++ u64 stop_delay = 0; ++ ++ p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; ++ ++ if (!(lsr & UART_LSR_THRE)) ++ return; ++ /* ++ * To provide required timing and allow FIFO transfer, ++ * __stop_tx_rs485() must be called only when both FIFO and ++ * shift register are empty. The device driver should either ++ * enable interrupt on TEMT or set UART_CAP_NOTEMT that will ++ * enlarge stop_tx_timer by the tx time of one frame to cover ++ * for emptying of the shift register. ++ */ ++ if (!(lsr & UART_LSR_TEMT)) { ++ if (!(p->capabilities & UART_CAP_NOTEMT)) ++ return; ++ /* ++ * RTS might get deasserted too early with the normal ++ * frame timing formula. It seems to suggest THRE might ++ * get asserted already during tx of the stop bit ++ * rather than after it is fully sent. ++ * Roughly estimate 1 extra bit here with / 7. 
++ */ ++ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); ++ } ++ ++ __stop_tx_rs485(p, stop_delay); ++ } ++ ++ if (serial8250_clear_THRI(p)) ++ serial8250_rpm_put_tx(p); ++} ++ ++static void serial8250_stop_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ __stop_tx(up); ++ ++ /* ++ * We really want to stop the transmitter from sending. ++ */ ++ if (port->type == PORT_16C950) { ++ up->acr |= UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++ serial8250_rpm_put(up); ++} ++ ++static inline void __start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ if (up->dma && !up->dma->tx_dma(up)) ++ return; ++ ++ if (serial8250_set_THRI(up)) { ++ if (up->bugs & UART_BUG_TXEN) { ++ u16 lsr = serial_lsr_in(up); ++ ++ if (lsr & UART_LSR_THRE) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ /* ++ * Re-enable the transmitter if we disabled it. ++ */ ++ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { ++ up->acr &= ~UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++} ++ ++/** ++ * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback ++ * @up: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to start rs485 transmission. ++ * Assumes that setting the RTS bit in the MCR register means RTS is high. ++ * (Some chips use inverse semantics.) Further assumes that reception is ++ * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the ++ * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) ++ */ ++void serial8250_em485_start_tx(struct uart_8250_port *up) ++{ ++ unsigned char mcr = serial8250_in_MCR(up); ++ ++ if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) ++ serial8250_stop_rx(&up->port); ++ ++ if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); ++ ++/* Returns false, if start_tx_timer was setup to defer TX start */ ++static bool start_tx_rs485(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ /* ++ * While serial8250_em485_handle_stop_tx() is a noop if ++ * em485->active_timer != &em485->stop_tx_timer, it might happen that ++ * the timer is still armed and triggers only after the current bunch of ++ * chars is send and em485->active_timer == &em485->stop_tx_timer again. ++ * So cancel the timer. There is still a theoretical race condition if ++ * the timer is already running and only comes around to check for ++ * em485->active_timer when &em485->stop_tx_timer is armed again. 
++ */ ++ if (em485->active_timer == &em485->stop_tx_timer) ++ hrtimer_try_to_cancel(&em485->stop_tx_timer); ++ ++ em485->active_timer = NULL; ++ ++ if (em485->tx_stopped) { ++ em485->tx_stopped = false; ++ ++ up->rs485_start_tx(up); ++ ++ if (up->port.rs485.delay_rts_before_send > 0) { ++ em485->active_timer = &em485->start_tx_timer; ++ start_hrtimer_ms(&em485->start_tx_timer, ++ up->port.rs485.delay_rts_before_send); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ start_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->start_tx_timer) { ++ __start_tx(&p->port); ++ em485->active_timer = NULL; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void serial8250_start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ if (!port->x_char && uart_circ_empty(&port->state->xmit)) ++ return; ++ ++ serial8250_rpm_get_tx(up); ++ ++ if (em485) { ++ if ((em485->active_timer == &em485->start_tx_timer) || ++ !start_tx_rs485(port)) ++ return; ++ } ++ __start_tx(port); ++} ++ ++static void serial8250_throttle(struct uart_port *port) ++{ ++ port->throttle(port); ++} ++ ++static void serial8250_unthrottle(struct uart_port *port) ++{ ++ port->unthrottle(port); ++} ++ ++static void serial8250_disable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_disable_ms(up->gpios); ++ ++ up->ier &= ~UART_IER_MSI; ++ serial_port_out(port, UART_IER, up->ier); ++} ++ ++static void serial8250_enable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_enable_ms(up->gpios); ++ ++ up->ier |= UART_IER_MSI; ++ ++ serial8250_rpm_get(up); ++ serial_port_out(port, UART_IER, up->ier); ++ serial8250_rpm_put(up); ++} ++ ++void serial8250_read_char(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char ch; ++ char flag = TTY_NORMAL; ++ ++ if (likely(lsr & UART_LSR_DR)) ++ ch = serial_in(up, UART_RX); ++ else ++ /* ++ * Intel 82571 has a Serial Over Lan device that will ++ * set UART_LSR_BI without setting UART_LSR_DR when ++ * it receives a break. To avoid reading from the ++ * receive buffer without UART_LSR_DR bit set, we ++ * just force the read character to be 0 ++ */ ++ ch = 0; ++ ++ port->icount.rx++; ++ ++ lsr |= up->lsr_saved_flags; ++ up->lsr_saved_flags = 0; ++ ++ if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { ++ if (lsr & UART_LSR_BI) { ++ lsr &= ~(UART_LSR_FE | UART_LSR_PE); ++ port->icount.brk++; ++ /* ++ * We do the SysRQ and SAK checking ++ * here because otherwise the break ++ * may get masked by ignore_status_mask ++ * or read_status_mask. ++ */ ++ if (uart_handle_break(port)) ++ return; ++ } else if (lsr & UART_LSR_PE) ++ port->icount.parity++; ++ else if (lsr & UART_LSR_FE) ++ port->icount.frame++; ++ if (lsr & UART_LSR_OE) ++ port->icount.overrun++; ++ ++ /* ++ * Mask off conditions which should be ignored. 
++ */ ++ lsr &= port->read_status_mask; ++ ++ if (lsr & UART_LSR_BI) { ++ dev_dbg(port->dev, "handling break\n"); ++ flag = TTY_BREAK; ++ } else if (lsr & UART_LSR_PE) ++ flag = TTY_PARITY; ++ else if (lsr & UART_LSR_FE) ++ flag = TTY_FRAME; ++ } ++ if (uart_prepare_sysrq_char(port, ch)) ++ return; ++ ++ uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); ++} ++EXPORT_SYMBOL_GPL(serial8250_read_char); ++ ++/* ++ * serial8250_rx_chars - Read characters. The first LSR value must be passed in. ++ * ++ * Returns LSR bits. The caller should rely only on non-Rx related LSR bits ++ * (such as THRE) because the LSR value might come from an already consumed ++ * character. ++ */ ++u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ int max_count = 256; ++ ++ do { ++ serial8250_read_char(up, lsr); ++ if (--max_count == 0) ++ break; ++ lsr = serial_in(up, UART_LSR); ++ } while (lsr & (UART_LSR_DR | UART_LSR_BI)); ++ ++ tty_flip_buffer_push(&port->state->port); ++ return lsr; ++} ++EXPORT_SYMBOL_GPL(serial8250_rx_chars); ++ ++void serial8250_tx_chars(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct circ_buf *xmit = &port->state->xmit; ++ int count; ++ ++ if (port->x_char) { ++ uart_xchar_out(port, UART_TX); ++ return; ++ } ++ if (uart_tx_stopped(port)) { ++ serial8250_stop_tx(port); ++ return; ++ } ++ if (uart_circ_empty(xmit)) { ++ __stop_tx(up); ++ return; ++ } ++ ++ count = up->tx_loadsz; ++ do { ++ serial_out(up, UART_TX, xmit->buf[xmit->tail]); ++ if (up->bugs & UART_BUG_TXRACE) { ++ /* ++ * The Aspeed BMC virtual UARTs have a bug where data ++ * may get stuck in the BMC's Tx FIFO from bursts of ++ * writes on the APB interface. ++ * ++ * Delay back-to-back writes by a read cycle to avoid ++ * stalling the VUART. Read a register that won't have ++ * side-effects and discard the result. ++ */ ++ serial_in(up, UART_SCR); ++ } ++ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); ++ port->icount.tx++; ++ if (uart_circ_empty(xmit)) ++ break; ++ if ((up->capabilities & UART_CAP_HFIFO) && ++ !uart_lsr_tx_empty(serial_in(up, UART_LSR))) ++ break; ++ /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ ++ if ((up->capabilities & UART_CAP_MINI) && ++ !(serial_in(up, UART_LSR) & UART_LSR_THRE)) ++ break; ++ } while (--count > 0); ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(port); ++ ++ /* ++ * With RPM enabled, we have to wait until the FIFO is empty before the ++ * HW can go idle. 
So we get here once again with empty FIFO and disable ++ * the interrupt and RPM in __stop_tx() ++ */ ++ if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM)) ++ __stop_tx(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_tx_chars); ++ ++/* Caller holds uart port lock */ ++unsigned int serial8250_modem_status(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int status = serial_in(up, UART_MSR); ++ ++ status |= up->msr_saved_flags; ++ up->msr_saved_flags = 0; ++ if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && ++ port->state != NULL) { ++ if (status & UART_MSR_TERI) ++ port->icount.rng++; ++ if (status & UART_MSR_DDSR) ++ port->icount.dsr++; ++ if (status & UART_MSR_DDCD) ++ uart_handle_dcd_change(port, status & UART_MSR_DCD); ++ if (status & UART_MSR_DCTS) ++ uart_handle_cts_change(port, status & UART_MSR_CTS); ++ ++ wake_up_interruptible(&port->state->port.delta_msr_wait); ++ } ++ ++ return status; ++} ++EXPORT_SYMBOL_GPL(serial8250_modem_status); ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RDI: ++ if (!up->dma->rx_running) ++ break; ++ fallthrough; ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ serial8250_rx_dma_flush(up); ++ return true; ++ } ++ return up->dma->rx_dma(up); ++} ++ ++/* ++ * This handles the interrupt from one port. ++ */ ++int serial8250_handle_irq(struct uart_port *port, unsigned int iir) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ bool skip_rx = false; ++ unsigned long flags; ++ u16 status; ++ ++ if (iir & UART_IIR_NO_INT) ++ return 0; ++ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ status = serial_lsr_in(up); ++ ++ /* ++ * If port is stopped and there are no error conditions in the ++ * FIFO, then don't drain the FIFO, as this may lead to TTY buffer ++ * overflow. Not servicing, RX FIFO would trigger auto HW flow ++ * control when FIFO occupancy reaches preset threshold, thus ++ * halting RX. This only works when auto HW flow control is ++ * available. ++ */ ++ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && ++ (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && ++ !(port->read_status_mask & UART_LSR_DR)) ++ skip_rx = true; ++ ++ if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { ++ if (!up->dma || handle_rx_dma(up, iir)) ++ status = serial8250_rx_chars(up, status); ++ } ++ serial8250_modem_status(up); ++ if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { ++ if (!up->dma || up->dma->tx_err) ++ serial8250_tx_chars(up); ++ else if (!up->dma->tx_running) ++ __stop_tx(up); ++ } ++ ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ ++ return 1; ++} ++EXPORT_SYMBOL_GPL(serial8250_handle_irq); ++ ++static int serial8250_default_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir; ++ int ret; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ serial8250_rpm_put(up); ++ return ret; ++} ++ ++/* ++ * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP ++ * have a programmable TX threshold that triggers the THRE interrupt in ++ * the IIR register. In this case, the THRE interrupt indicates the FIFO ++ * has space available. Load it up with tx_loadsz bytes. 
++ */ ++static int serial8250_tx_threshold_handle_irq(struct uart_port *port) ++{ ++ unsigned long flags; ++ unsigned int iir = serial_port_in(port, UART_IIR); ++ ++ /* TX Threshold IRQ triggered so load up FIFO */ ++ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ serial8250_tx_chars(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ } ++ ++ iir = serial_port_in(port, UART_IIR); ++ return serial8250_handle_irq(port, iir); ++} ++ ++static unsigned int serial8250_tx_empty(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ lsr = serial_lsr_in(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ serial8250_rpm_put(up); ++ ++ return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0; ++} ++ ++unsigned int serial8250_do_get_mctrl(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int status; ++ unsigned int val; ++ ++ serial8250_rpm_get(up); ++ status = serial8250_modem_status(up); ++ serial8250_rpm_put(up); ++ ++ val = serial8250_MSR_to_TIOCM(status); ++ if (up->gpios) ++ return mctrl_gpio_get(up->gpios, &val); ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); ++ ++static unsigned int serial8250_get_mctrl(struct uart_port *port) ++{ ++ if (port->get_mctrl) ++ return port->get_mctrl(port); ++ return serial8250_do_get_mctrl(port); ++} ++ ++void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char mcr; ++ ++ mcr = serial8250_TIOCM_to_MCR(mctrl); ++ ++ mcr |= up->mcr; ++ ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); ++ ++static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ if (port->rs485.flags & SER_RS485_ENABLED) ++ return; ++ ++ if (port->set_mctrl) ++ port->set_mctrl(port, mctrl); ++ else ++ serial8250_do_set_mctrl(port, mctrl); ++} ++ ++static void serial8250_break_ctl(struct uart_port *port, int break_state) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ if (break_state == -1) ++ up->lcr |= UART_LCR_SBC; ++ else ++ up->lcr &= ~UART_LCR_SBC; ++ serial_port_out(port, UART_LCR, up->lcr); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++} ++ ++static void wait_for_lsr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int status, tmout = 10000; ++ ++ /* Wait up to 10ms for the character(s) to be sent. 
*/ ++ for (;;) { ++ status = serial_lsr_in(up); ++ ++ if ((status & bits) == bits) ++ break; ++ if (--tmout == 0) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++} ++ ++/* ++ * Wait for transmitter & holding register to empty ++ */ ++static void wait_for_xmitr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int tmout; ++ ++ wait_for_lsr(up, bits); ++ ++ /* Wait up to 1s for flow control if necessary */ ++ if (up->port.flags & UPF_CONS_FLOW) { ++ for (tmout = 1000000; tmout; tmout--) { ++ unsigned int msr = serial_in(up, UART_MSR); ++ up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; ++ if (msr & UART_MSR_CTS) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++#ifdef CONFIG_CONSOLE_POLL ++/* ++ * Console polling routines for writing and reading from the uart while ++ * in an interrupt or debug context. ++ */ ++ ++static int serial8250_get_poll_char(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int status; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ lsr = serial_port_in(port, UART_LSR); ++ ++ if (!(lsr & UART_LSR_DR)) { ++ status = NO_POLL_CHAR; ++ goto out; ++ } ++ ++ status = serial_port_in(port, UART_RX); ++out: ++ serial8250_rpm_put(up); ++ return status; ++} ++ ++ ++static void serial8250_put_poll_char(struct uart_port *port, ++ unsigned char c) ++{ ++ unsigned int ier; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ /* ++ * Send the character out. ++ */ ++ serial_port_out(port, UART_TX, c); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ serial_port_out(port, UART_IER, ier); ++ serial8250_rpm_put(up); ++} ++ ++#endif /* CONFIG_CONSOLE_POLL */ ++ ++int serial8250_do_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ unsigned char iir; ++ int retval; ++ u16 lsr; ++ ++ if (!port->fifosize) ++ port->fifosize = uart_config[port->type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[port->type].flags; ++ up->mcr = 0; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ serial8250_rpm_get(up); ++ if (port->type == PORT_16C950) { ++ /* Wake up and initialize UART */ ++ up->acr = 0; ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_LCR, 0); ++ serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ if (port->type == PORT_DA830) { ++ /* Reset the port */ ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); ++ mdelay(10); ++ ++ /* Enable Tx, Rx and free run mode */ ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, ++ UART_DA830_PWREMU_MGMT_UTRST | ++ UART_DA830_PWREMU_MGMT_URRST | ++ UART_DA830_PWREMU_MGMT_FREE); ++ } ++ ++ if (port->type == PORT_NPCM) { ++ /* ++ * Nuvoton calls the scratch register 'UART_TOR' (timeout ++ * 
register). Enable it, and set TIOC (timeout interrupt ++ * comparator) to be 0x20 for correct operation. ++ */ ++ serial_port_out(port, UART_NPCM_TOR, UART_NPCM_TOIE | 0x20); ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * If this is an RSA port, see if we can kick it up to the ++ * higher speed clock. ++ */ ++ enable_rsa(up); ++#endif ++ ++ /* ++ * Clear the FIFO buffers and disable them. ++ * (they will be reenabled in set_termios()) ++ */ ++ serial8250_clear_fifos(up); ++ ++ /* ++ * Clear the interrupt registers. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ ++ /* ++ * At this point, there's no way the LSR could still be 0xff; ++ * if it is, then bail out, because there's likely no UART ++ * here. ++ */ ++ if (!(port->flags & UPF_BUGGY_UART) && ++ (serial_port_in(port, UART_LSR) == 0xff)) { ++ dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); ++ retval = -ENODEV; ++ goto out; ++ } ++ ++ /* ++ * For a XR16C850, we need to set the trigger levels ++ */ ++ if (port->type == PORT_16850) { ++ unsigned char fctr; ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_RX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_TX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ /* ++ * For the Altera 16550 variants, set TX threshold trigger level. ++ */ ++ if (((port->type == PORT_ALTR_16550_F32) || ++ (port->type == PORT_ALTR_16550_F64) || ++ (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { ++ /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ ++ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { ++ dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); ++ } else { ++ serial_port_out(port, UART_ALTR_AFR, ++ UART_ALTR_EN_TXFIFO_LW); ++ serial_port_out(port, UART_ALTR_TX_LOW, ++ port->fifosize - up->tx_loadsz); ++ port->handle_irq = serial8250_tx_threshold_handle_irq; ++ } ++ } ++ ++ /* Check if we need to have shared IRQs */ ++ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) ++ up->port.irqflags |= IRQF_SHARED; ++ ++ retval = up->ops->setup_irq(up); ++ if (retval) ++ goto out; ++ ++ if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { ++ unsigned char iir1; ++ ++ if (port->irqflags & IRQF_SHARED) ++ disable_irq_nosync(port->irq); ++ ++ /* ++ * Test for UARTs that do not reassert THRE when the ++ * transmitter is idle and the interrupt has already ++ * been cleared. Real 16550s should always reassert ++ * this interrupt whenever the transmitter is idle and ++ * the interrupt is enabled. Delays are necessary to ++ * allow register changes to become visible. 
++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow THRE to set */ ++ iir1 = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow a working UART time to re-assert THRE */ ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ if (port->irqflags & IRQF_SHARED) ++ enable_irq(port->irq); ++ ++ /* ++ * If the interrupt is not reasserted, or we otherwise ++ * don't trust the iir, setup a timer to kick the UART ++ * on a regular basis. ++ */ ++ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || ++ up->port.flags & UPF_BUG_THRE) { ++ up->bugs |= UART_BUG_THRE; ++ } ++ } ++ ++ up->ops->setup_timer(up); ++ ++ /* ++ * Now, initialize the UART ++ */ ++ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (up->port.flags & UPF_FOURPORT) { ++ if (!up->port.irq) ++ up->port.mctrl |= TIOCM_OUT1; ++ } else ++ /* ++ * Most PC uarts need OUT2 raised to enable interrupts. ++ */ ++ if (port->irq) ++ up->port.mctrl |= TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ ++ /* ++ * Serial over Lan (SoL) hack: ++ * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be ++ * used for Serial Over Lan. Those chips take a longer time than a ++ * normal serial device to signalize that a transmission data was ++ * queued. Due to that, the above test generally fails. One solution ++ * would be to delay the reading of iir. However, this is not ++ * reliable, since the timeout is variable. So, let's just don't ++ * test if we receive TX irq. This way, we'll never enable ++ * UART_BUG_TXEN. ++ */ ++ if (up->port.quirks & UPQ_NO_TXEN_TEST) ++ goto dont_test_tx_en; ++ ++ /* ++ * Do a quick test to see if we receive an interrupt when we enable ++ * the TX irq. ++ */ ++ serial_port_out(port, UART_IER, UART_IER_THRI); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { ++ if (!(up->bugs & UART_BUG_TXEN)) { ++ up->bugs |= UART_BUG_TXEN; ++ dev_dbg(port->dev, "enabling bad tx status workarounds\n"); ++ } ++ } else { ++ up->bugs &= ~UART_BUG_TXEN; ++ } ++ ++dont_test_tx_en: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Clear the interrupt registers again for luck, and clear the ++ * saved flags to avoid getting false values from polling ++ * routines or the previous session. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* ++ * Request DMA channels for both RX and TX. ++ */ ++ if (up->dma) { ++ const char *msg = NULL; ++ ++ if (uart_console(port)) ++ msg = "forbid DMA for kernel console"; ++ else if (serial8250_request_dma(up)) ++ msg = "failed to request DMA"; ++ if (msg) { ++ dev_warn_ratelimited(port->dev, "%s\n", msg); ++ up->dma = NULL; ++ } ++ } ++ ++ /* ++ * Set the IER shadow for rx interrupts but defer actual interrupt ++ * enable until after the FIFOs are enabled; otherwise, an already- ++ * active sender can swamp the interrupt handler with "too much work". 
++ */ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ ++ if (port->flags & UPF_FOURPORT) { ++ unsigned int icp; ++ /* ++ * Enable interrupts on the AST Fourport board ++ */ ++ icp = (port->iobase & 0xfe0) | 0x01f; ++ outb_p(0x80, icp); ++ inb_p(icp); ++ } ++ retval = 0; ++out: ++ serial8250_rpm_put(up); ++ return retval; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_startup); ++ ++static int serial8250_startup(struct uart_port *port) ++{ ++ if (port->startup) ++ return port->startup(port); ++ return serial8250_do_startup(port); ++} ++ ++void serial8250_do_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ /* ++ * Disable interrupts from this port ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ up->ier = 0; ++ serial_port_out(port, UART_IER, 0); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ synchronize_irq(port->irq); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (port->flags & UPF_FOURPORT) { ++ /* reset interrupts on the AST Fourport board */ ++ inb((port->iobase & 0xfe0) | 0x1f); ++ port->mctrl |= TIOCM_OUT1; ++ } else ++ port->mctrl &= ~TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ serial_port_out(port, UART_LCR, ++ serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); ++ serial8250_clear_fifos(up); ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Reset the RSA board back to 115kbps compat mode. ++ */ ++ disable_rsa(up); ++#endif ++ ++ /* ++ * Read data port to reset things, and then unlink from ++ * the IRQ chain. ++ */ ++ serial_port_in(port, UART_RX); ++ serial8250_rpm_put(up); ++ ++ up->ops->release_irq(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_shutdown); ++ ++static void serial8250_shutdown(struct uart_port *port) ++{ ++ if (port->shutdown) ++ port->shutdown(port); ++ else ++ serial8250_do_shutdown(port); ++} ++ ++/* Nuvoton NPCM UARTs have a custom divisor calculation */ ++static unsigned int npcm_get_divisor(struct uart_8250_port *up, ++ unsigned int baud) ++{ ++ struct uart_port *port = &up->port; ++ ++ return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2; ++} ++ ++static unsigned int serial8250_do_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int quot; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. We clamp custom rates from clk/6 and clk/12 ++ * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These ++ * magic divisors actually reprogram the baud rate generator's ++ * reference clock derived from chips's 14.318MHz clock input. ++ * ++ * Documentation claims that with these magic divisors the base ++ * frequencies of 7.3728MHz and 3.6864MHz are used respectively ++ * for the extra baud rates of 460800bps and 230400bps rather ++ * than the usual base frequency of 1.8462MHz. However empirical ++ * evidence contradicts that. ++ * ++ * Instead bit 7 of the DLM register (bit 15 of the divisor) is ++ * effectively used as a clock prescaler selection bit for the ++ * base frequency of 7.3728MHz, always used. If set to 0, then ++ * the base frequency is divided by 4 for use by the Baud Rate ++ * Generator, for the usual arrangement where the value of 1 of ++ * the divisor produces the baud rate of 115200bps. 
Conversely, ++ * if set to 1 and high-speed operation has been enabled with the ++ * Serial Port Mode Register in the Device Configuration Space, ++ * then the base frequency is supplied directly to the Baud Rate ++ * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, ++ * 0x8004, etc. the respective baud rates produced are 460800bps, ++ * 230400bps, 153600bps, 115200bps, etc. ++ * ++ * In all cases only low 15 bits of the divisor are used to divide ++ * the baud base and therefore 32767 is the maximum divisor value ++ * possible, even though documentation says that the programmable ++ * Baud Rate Generator is capable of dividing the internal PLL ++ * clock by any divisor from 1 to 65535. ++ */ ++ if (magic_multiplier && baud >= port->uartclk / 6) ++ quot = 0x8001; ++ else if (magic_multiplier && baud >= port->uartclk / 12) ++ quot = 0x8002; ++ else if (up->port.type == PORT_NPCM) ++ quot = npcm_get_divisor(up, baud); ++ else ++ quot = uart_get_divisor(port, baud); ++ ++ /* ++ * Oxford Semi 952 rev B workaround ++ */ ++ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) ++ quot++; ++ ++ return quot; ++} ++ ++static unsigned int serial8250_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ if (port->get_divisor) ++ return port->get_divisor(port, baud, frac); ++ ++ return serial8250_do_get_divisor(port, baud, frac); ++} ++ ++static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, ++ tcflag_t c_cflag) ++{ ++ unsigned char cval; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); ++ ++ if (c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (c_cflag & PARENB) { ++ cval |= UART_LCR_PARITY; ++ if (up->bugs & UART_BUG_PARITY) ++ up->fifo_bug = true; ++ } ++ if (!(c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ return cval; ++} ++ ++void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */ ++ if (is_omap1510_8250(up)) { ++ if (baud == 115200) { ++ quot = 1; ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); ++ } else ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); ++ } ++ ++ /* ++ * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, ++ * otherwise just set DLAB ++ */ ++ if (up->capabilities & UART_NATSEMI) ++ serial_port_out(port, UART_LCR, 0xe0); ++ else ++ serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); ++ ++ serial_dl_write(up, quot); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); ++ ++static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ if (port->set_divisor) ++ port->set_divisor(port, baud, quot, quot_frac); ++ else ++ serial8250_do_set_divisor(port, baud, quot, quot_frac); ++} ++ ++static unsigned int serial8250_get_baud_rate(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ unsigned int tolerance = port->uartclk / 100; ++ unsigned int min; ++ unsigned int max; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. Enable custom rates of clk/4 and clk/8, but ++ * disable divisor values beyond 32767, which are unavailable. 
++ */ ++ if (port->flags & UPF_MAGIC_MULTIPLIER) { ++ min = port->uartclk / 16 / UART_DIV_MAX >> 1; ++ max = (port->uartclk + tolerance) / 4; ++ } else { ++ min = port->uartclk / 16 / UART_DIV_MAX; ++ max = (port->uartclk + tolerance) / 16; ++ } ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ * Allow 1% tolerance at the upper limit so uart clks marginally ++ * slower than nominal still match standard baud rates without ++ * causing transmission errors. ++ */ ++ return uart_get_baud_rate(port, termios, old, min, max); ++} ++ ++/* ++ * Note in order to avoid the tty port mutex deadlock don't use the next method ++ * within the uart port callbacks. Primarily it's supposed to be utilized to ++ * handle a sudden reference clock rate change. ++ */ ++void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct tty_port *tport = &port->state->port; ++ unsigned int baud, quot, frac = 0; ++ struct ktermios *termios; ++ struct tty_struct *tty; ++ unsigned long flags; ++ ++ tty = tty_port_tty_get(tport); ++ if (!tty) { ++ mutex_lock(&tport->mutex); ++ port->uartclk = uartclk; ++ mutex_unlock(&tport->mutex); ++ return; ++ } ++ ++ down_write(&tty->termios_rwsem); ++ mutex_lock(&tport->mutex); ++ ++ if (port->uartclk == uartclk) ++ goto out_unlock; ++ ++ port->uartclk = uartclk; ++ ++ if (!tty_port_initialized(tport)) ++ goto out_unlock; ++ ++ termios = &tty->termios; ++ ++ baud = serial8250_get_baud_rate(port, termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++out_unlock: ++ mutex_unlock(&tport->mutex); ++ up_write(&tty->termios_rwsem); ++ tty_kref_put(tty); ++} ++EXPORT_SYMBOL_GPL(serial8250_update_uartclk); ++ ++void ++serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char cval; ++ unsigned long flags; ++ unsigned int baud, quot, frac = 0; ++ ++ if (up->capabilities & UART_CAP_MINI) { ++ termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); ++ if ((termios->c_cflag & CSIZE) == CS5 || ++ (termios->c_cflag & CSIZE) == CS6) ++ termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; ++ } ++ cval = serial8250_compute_lcr(up, termios->c_cflag); ++ ++ baud = serial8250_get_baud_rate(port, termios, old); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->lcr = cval; /* Save computed LCR */ ++ ++ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { ++ /* NOTE: If fifo_bug is not set, a user can set RX_trigger. */ ++ if ((baud < 2400 && !up->dma) || up->fifo_bug) { ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= UART_FCR_TRIGGER_1; ++ } ++ } ++ ++ /* ++ * MCR-based auto flow control. When AFE is enabled, RTS will be ++ * deasserted when the receive FIFO contains more characters than ++ * the trigger, or the MCR RTS bit is cleared. 
++ */ ++ if (up->capabilities & UART_CAP_AFE) { ++ up->mcr &= ~UART_MCR_AFE; ++ if (termios->c_cflag & CRTSCTS) ++ up->mcr |= UART_MCR_AFE; ++ } ++ ++ /* ++ * Update the per-port timeout. ++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) ++ port->read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ port->ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ port->ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ port->ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * CTS flow control flag and modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (!(up->bugs & UART_BUG_NOMSR) && ++ UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ if (up->capabilities & UART_CAP_UUE) ++ up->ier |= UART_IER_UUE; ++ if (up->capabilities & UART_CAP_RTOIE) ++ up->ier |= UART_IER_RTOIE; ++ ++ serial_port_out(port, UART_IER, up->ier); ++ ++ if (up->capabilities & UART_CAP_EFR) { ++ unsigned char efr = 0; ++ /* ++ * TI16C752/Startech hardware flow control. FIXME: ++ * - TI16C752 requires control thresholds to be set. ++ * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. ++ */ ++ if (termios->c_cflag & CRTSCTS) ++ efr |= UART_EFR_CTS; ++ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (port->flags & UPF_EXAR_EFR) ++ serial_port_out(port, UART_XR_EFR, efr); ++ else ++ serial_port_out(port, UART_EFR, efr); ++ } ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ ++ /* ++ * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR ++ * is written without DLAB set, this mode will be disabled. 
++ */ ++ if (port->type == PORT_16750) ++ serial_port_out(port, UART_FCR, up->fcr); ++ ++ serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ ++ if (port->type != PORT_16750) { ++ /* emulated UARTs (Lucent Venus 167x) need two steps */ ++ if (up->fcr & UART_FCR_ENABLE_FIFO) ++ serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ ++ } ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++EXPORT_SYMBOL(serial8250_do_set_termios); ++ ++static void ++serial8250_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ if (port->set_termios) ++ port->set_termios(port, termios, old); ++ else ++ serial8250_do_set_termios(port, termios, old); ++} ++ ++void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (termios->c_line == N_PPS) { ++ port->flags |= UPF_HARDPPS_CD; ++ spin_lock_irq(&port->lock); ++ serial8250_enable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } else { ++ port->flags &= ~UPF_HARDPPS_CD; ++ if (!UART_ENABLE_MS(port, termios->c_cflag)) { ++ spin_lock_irq(&port->lock); ++ serial8250_disable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); ++ ++static void ++serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (port->set_ldisc) ++ port->set_ldisc(port, termios); ++ else ++ serial8250_do_set_ldisc(port, termios); ++} ++ ++void serial8250_do_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *p = up_to_u8250p(port); ++ ++ serial8250_set_sleep(p, state != 0); ++} ++EXPORT_SYMBOL(serial8250_do_pm); ++ ++static void ++serial8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ if (port->pm) ++ port->pm(port, state, oldstate); ++ else ++ serial8250_do_pm(port, state, oldstate); ++} ++ ++static unsigned int serial8250_port_size(struct uart_8250_port *pt) ++{ ++ if (pt->port.mapsize) ++ return pt->port.mapsize; ++ if (pt->port.iotype == UPIO_AU) { ++ if (pt->port.type == PORT_RT2880) ++ return 0x100; ++ return 0x1000; ++ } ++ if (is_omap1_8250(pt)) ++ return 0x16 << pt->port.regshift; ++ ++ return 8 << pt->port.regshift; ++} ++ ++/* ++ * Resource handling. 
++ */ ++static int serial8250_request_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ int ret = 0; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (!request_mem_region(port->mapbase, size, "serial")) { ++ ret = -EBUSY; ++ break; ++ } ++ ++ if (port->flags & UPF_IOREMAP) { ++ port->membase = ioremap(port->mapbase, size); ++ if (!port->membase) { ++ release_mem_region(port->mapbase, size); ++ ret = -ENOMEM; ++ } ++ } ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ if (!request_region(port->iobase, size, "serial")) ++ ret = -EBUSY; ++ break; ++ } ++ return ret; ++} ++ ++static void serial8250_release_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) ++ break; ++ ++ if (port->flags & UPF_IOREMAP) { ++ iounmap(port->membase); ++ port->membase = NULL; ++ } ++ ++ release_mem_region(port->mapbase, size); ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ release_region(port->iobase, size); ++ break; ++ } ++} ++ ++static void serial8250_release_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_release_std_resource(up); ++} ++ ++static int serial8250_request_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ return serial8250_request_std_resource(up); ++} ++ ++static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ unsigned char bytes; ++ ++ bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; ++ ++ return bytes ? 
bytes : -EOPNOTSUPP; ++} ++ ++static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ int i; ++ ++ if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) ++ return -EOPNOTSUPP; ++ ++ for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { ++ if (bytes < conf_type->rxtrig_bytes[i]) ++ /* Use the nearest lower value */ ++ return (--i) << UART_FCR_R_TRIG_SHIFT; ++ } ++ ++ return UART_FCR_R_TRIG_11; ++} ++ ++static int do_get_rxtrig(struct tty_port *port) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) ++ return -EINVAL; ++ ++ return fcr_get_rxtrig_bytes(up); ++} ++ ++static int do_serial8250_get_rxtrig(struct tty_port *port) ++{ ++ int rxtrig_bytes; ++ ++ mutex_lock(&port->mutex); ++ rxtrig_bytes = do_get_rxtrig(port); ++ mutex_unlock(&port->mutex); ++ ++ return rxtrig_bytes; ++} ++ ++static ssize_t rx_trig_bytes_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ int rxtrig_bytes; ++ ++ rxtrig_bytes = do_serial8250_get_rxtrig(port); ++ if (rxtrig_bytes < 0) ++ return rxtrig_bytes; ++ ++ return sysfs_emit(buf, "%d\n", rxtrig_bytes); ++} ++ ++static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ int rxtrig; ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 || ++ up->fifo_bug) ++ return -EINVAL; ++ ++ rxtrig = bytes_to_fcr_rxtrig(up, bytes); ++ if (rxtrig < 0) ++ return rxtrig; ++ ++ serial8250_clear_fifos(up); ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= (unsigned char)rxtrig; ++ serial_out(up, UART_FCR, up->fcr); ++ return 0; ++} ++ ++static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ int ret; ++ ++ mutex_lock(&port->mutex); ++ ret = do_set_rxtrig(port, bytes); ++ mutex_unlock(&port->mutex); ++ ++ return ret; ++} ++ ++static ssize_t rx_trig_bytes_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ unsigned char bytes; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ ret = kstrtou8(buf, 10, &bytes); ++ if (ret < 0) ++ return ret; ++ ++ ret = do_serial8250_set_rxtrig(port, bytes); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(rx_trig_bytes); ++ ++static struct attribute *serial8250_dev_attrs[] = { ++ &dev_attr_rx_trig_bytes.attr, ++ NULL ++}; ++ ++static struct attribute_group serial8250_dev_attr_group = { ++ .attrs = serial8250_dev_attrs, ++}; ++ ++static void register_dev_spec_attr_grp(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ ++ if (conf_type->rxtrig_bytes[0]) ++ up->port.attr_group = &serial8250_dev_attr_group; ++} ++ ++static void serial8250_config_port(struct uart_port *port, int flags) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int ret; ++ ++ /* ++ * Find the region that we can probe for. This in turn ++ * tells us whether we can probe for the type of port. 
++ */ ++ ret = serial8250_request_std_resource(up); ++ if (ret < 0) ++ return; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ if (flags & UART_CONFIG_TYPE) ++ autoconfig(up); ++ ++ /* if access method is AU, it is a 16550 with a quirk */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_AU) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ /* HW bugs may trigger IRQ while IIR == NO_INT */ ++ if (port->type == PORT_TEGRA) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) ++ autoconfig_irq(up); ++ ++ if (port->type == PORT_UNKNOWN) ++ serial8250_release_std_resource(up); ++ ++ register_dev_spec_attr_grp(up); ++ up->fcr = uart_config[up->port.type].fcr; ++} ++ ++static int ++serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) ++{ ++ if (ser->irq >= nr_irqs || ser->irq < 0 || ++ ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ++ ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ++ ser->type == PORT_STARTECH) ++ return -EINVAL; ++ return 0; ++} ++ ++static const char *serial8250_type(struct uart_port *port) ++{ ++ int type = port->type; ++ ++ if (type >= ARRAY_SIZE(uart_config)) ++ type = 0; ++ return uart_config[type].name; ++} ++ ++static const struct uart_ops serial8250_pops = { ++ .tx_empty = serial8250_tx_empty, ++ .set_mctrl = serial8250_set_mctrl, ++ .get_mctrl = serial8250_get_mctrl, ++ .stop_tx = serial8250_stop_tx, ++ .start_tx = serial8250_start_tx, ++ .throttle = serial8250_throttle, ++ .unthrottle = serial8250_unthrottle, ++ .stop_rx = serial8250_stop_rx, ++ .enable_ms = serial8250_enable_ms, ++ .break_ctl = serial8250_break_ctl, ++ .startup = serial8250_startup, ++ .shutdown = serial8250_shutdown, ++ .set_termios = serial8250_set_termios, ++ .set_ldisc = serial8250_set_ldisc, ++ .pm = serial8250_pm, ++ .type = serial8250_type, ++ .release_port = serial8250_release_port, ++ .request_port = serial8250_request_port, ++ .config_port = serial8250_config_port, ++ .verify_port = serial8250_verify_port, ++#ifdef CONFIG_CONSOLE_POLL ++ .poll_get_char = serial8250_get_poll_char, ++ .poll_put_char = serial8250_put_poll_char, ++#endif ++}; ++ ++void serial8250_init_port(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ spin_lock_init(&port->lock); ++ port->ops = &serial8250_pops; ++ port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ up->cur_iotype = 0xFF; ++} ++EXPORT_SYMBOL_GPL(serial8250_init_port); ++ ++void serial8250_set_defaults(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ if (up->port.flags & UPF_FIXED_TYPE) { ++ unsigned int type = up->port.type; ++ ++ if (!up->port.fifosize) ++ up->port.fifosize = uart_config[type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[type].flags; ++ } ++ ++ set_io_from_upio(port); ++ ++ /* default dma handlers */ ++ if (up->dma) { ++ if (!up->dma->tx_dma) ++ up->dma->tx_dma = serial8250_tx_dma; ++ if (!up->dma->rx_dma) ++ up->dma->rx_dma = serial8250_rx_dma; ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_set_defaults); ++ ++#ifdef CONFIG_SERIAL_8250_CONSOLE ++ ++static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out(port, UART_TX, ch); ++} ++ ++/* ++ * Restore serial console when h/w power-off detected ++ */ ++static void 
serial8250_console_restore(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct ktermios termios; ++ unsigned int baud, quot, frac = 0; ++ ++ termios.c_cflag = port->cons->cflag; ++ termios.c_ispeed = port->cons->ispeed; ++ termios.c_ospeed = port->cons->ospeed; ++ if (port->state->port.tty && termios.c_cflag == 0) { ++ termios.c_cflag = port->state->port.tty->termios.c_cflag; ++ termios.c_ispeed = port->state->port.tty->termios.c_ispeed; ++ termios.c_ospeed = port->state->port.tty->termios.c_ospeed; ++ } ++ ++ baud = serial8250_get_baud_rate(port, &termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); ++} ++ ++/* ++ * Print a string to the serial port using the device FIFO ++ * ++ * It sends fifosize bytes and then waits for the fifo ++ * to get empty. ++ */ ++static void serial8250_console_fifo_write(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ int i; ++ const char *end = s + count; ++ unsigned int fifosize = up->tx_loadsz; ++ bool cr_sent = false; ++ ++ while (s != end) { ++ wait_for_lsr(up, UART_LSR_THRE); ++ ++ for (i = 0; i < fifosize && s != end; ++i) { ++ if (*s == '\n' && !cr_sent) { ++ serial_out(up, UART_TX, '\r'); ++ cr_sent = true; ++ } else { ++ serial_out(up, UART_TX, *s++); ++ cr_sent = false; ++ } ++ } ++ } ++} ++ ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * The console_lock must be held when we get here. ++ * ++ * Doing runtime PM is really a bad idea for the kernel console. ++ * Thus, we assume the function is called when device is powered up. ++ */ ++void serial8250_console_write(struct uart_8250_port *up, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_em485 *em485 = up->em485; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier, use_fifo; ++ int locked = 1; ++ ++ touch_nmi_watchdog(); ++ ++ if (oops_in_progress) ++ locked = spin_trylock_irqsave(&port->lock, flags); ++ else ++ spin_lock_irqsave(&port->lock, flags); ++ ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ /* check scratch reg to see if port powered off during system sleep */ ++ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { ++ serial8250_console_restore(up); ++ up->canary = 0; ++ } ++ ++ if (em485) { ++ if (em485->tx_stopped) ++ up->rs485_start_tx(up); ++ mdelay(port->rs485.delay_rts_before_send); ++ } ++ ++ use_fifo = (up->capabilities & UART_CAP_FIFO) && ++ /* ++ * BCM283x requires to check the fifo ++ * after each byte. ++ */ ++ !(up->capabilities & UART_CAP_MINI) && ++ /* ++ * tx_loadsz contains the transmit fifo size ++ */ ++ up->tx_loadsz > 1 && ++ (up->fcr & UART_FCR_ENABLE_FIFO) && ++ port->state && ++ test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && ++ /* ++ * After we put a data in the fifo, the controller will send ++ * it regardless of the CTS state. Therefore, only use fifo ++ * if we don't use control flow. 
++ */ ++ !(up->port.flags & UPF_CONS_FLOW); ++ ++ if (likely(use_fifo)) ++ serial8250_console_fifo_write(up, s, count); ++ else ++ uart_console_write(port, s, count, serial8250_console_putchar); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ ++ if (em485) { ++ mdelay(port->rs485.delay_rts_after_send); ++ if (em485->tx_stopped) ++ up->rs485_stop_tx(up); ++ } ++ ++ serial_port_out(port, UART_IER, ier); ++ ++ /* ++ * The receive handling will happen properly because the ++ * receive ready bit will still be set; it is not cleared ++ * on read. However, modem control will not, we must ++ * call it if we have saved something in the saved flags ++ * while processing with interrupts off. ++ */ ++ if (up->msr_saved_flags) ++ serial8250_modem_status(up); ++ ++ if (locked) ++ spin_unlock_irqrestore(&port->lock, flags); ++} ++ ++static unsigned int probe_baud(struct uart_port *port) ++{ ++ unsigned char lcr, dll, dlm; ++ unsigned int quot; ++ ++ lcr = serial_port_in(port, UART_LCR); ++ serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); ++ dll = serial_port_in(port, UART_DLL); ++ dlm = serial_port_in(port, UART_DLM); ++ serial_port_out(port, UART_LCR, lcr); ++ ++ quot = (dlm << 8) | dll; ++ return (port->uartclk / 16) / quot; ++} ++ ++int serial8250_console_setup(struct uart_port *port, char *options, bool probe) ++{ ++ int baud = 9600; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ if (!port->iobase && !port->membase) ++ return -ENODEV; ++ ++ if (options) ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ else if (probe) ++ baud = probe_baud(port); ++ ++ ret = uart_set_options(port, port->cons, baud, parity, bits, flow); ++ if (ret) ++ return ret; ++ ++ if (port->dev) ++ pm_runtime_get_sync(port->dev); ++ ++ return 0; ++} ++ ++int serial8250_console_exit(struct uart_port *port) ++{ ++ if (port->dev) ++ pm_runtime_put_sync(port->dev); ++ ++ return 0; ++} ++ ++#endif /* CONFIG_SERIAL_8250_CONSOLE */ ++ ++MODULE_LICENSE("GPL"); +diff -rupN linux.orig/drivers/tty/serial/8250/Kconfig linux/drivers/tty/serial/8250/Kconfig +--- linux.orig/drivers/tty/serial/8250/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/Kconfig 2022-12-04 10:40:26.708034065 -0500 @@ -9,6 +9,7 @@ config SERIAL_8250 depends on !S390 select SERIAL_CORE @@ -4064,11 +26933,10 @@ index d0b49e15fbf5e..02c308467339c 100644 help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. 
People who might say N -diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c -index 15f0e4d88c5a0..ffdb001e3d109 100644 ---- a/drivers/tty/serial/amba-pl011.c -+++ b/drivers/tty/serial/amba-pl011.c -@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +diff -rupN linux.orig/drivers/tty/serial/amba-pl011.c linux/drivers/tty/serial/amba-pl011.c +--- linux.orig/drivers/tty/serial/amba-pl011.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/amba-pl011.c 2022-12-04 10:40:26.708034065 -0500 +@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; @@ -4097,7 +26965,7 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 /* * First save the CR then disable the interrupts -@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, pl011_write(old_cr, uap, REG_CR); if (locked) @@ -4107,11 +26975,10 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 clk_disable(uap->clk); } -diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c -index 0aa666e247d57..d7130d1ae64c0 100644 ---- a/drivers/tty/serial/omap-serial.c -+++ b/drivers/tty/serial/omap-serial.c -@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console *co, const char *s, +diff -rupN linux.orig/drivers/tty/serial/omap-serial.c linux/drivers/tty/serial/omap-serial.c +--- linux.orig/drivers/tty/serial/omap-serial.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/omap-serial.c 2022-12-04 10:40:26.708034065 -0500 +@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console unsigned int ier; int locked = 1; @@ -4128,7 +26995,7 @@ index 0aa666e247d57..d7130d1ae64c0 100644 /* * First save the IER then disable the interrupts -@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console *co, const char *s, +@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console check_modem_status(up); if (locked) @@ -4138,11 +27005,10 @@ index 0aa666e247d57..d7130d1ae64c0 100644 } static int __init -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index d2b2720db6ca7..18e623325887f 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_mask) +diff -rupN linux.orig/drivers/tty/sysrq.c linux/drivers/tty/sysrq.c +--- linux.orig/drivers/tty/sysrq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/sysrq.c 2022-12-04 10:40:26.708034065 -0500 +@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_ rcu_sysrq_start(); rcu_read_lock(); @@ -4150,7 +27016,7 @@ index d2b2720db6ca7..18e623325887f 100644 /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. 
We do not -@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_mask) +@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_ pr_cont("\n"); console_loglevel = orig_log_level; } @@ -4158,10 +27024,9 @@ index d2b2720db6ca7..18e623325887f 100644 rcu_read_unlock(); rcu_sysrq_end(); -diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h -index 4e0e50e7ac153..173e979b84a93 100644 ---- a/drivers/vdpa/vdpa_user/iova_domain.h -+++ b/drivers/vdpa/vdpa_user/iova_domain.h +diff -rupN linux.orig/drivers/vdpa/vdpa_user/iova_domain.h linux/drivers/vdpa/vdpa_user/iova_domain.h +--- linux.orig/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-04 10:40:26.708034065 -0500 @@ -14,7 +14,6 @@ #include #include @@ -4170,10 +27035,9 @@ index 4e0e50e7ac153..173e979b84a93 100644 #define IOVA_START_PFN 1 -diff --git a/fs/dcache.c b/fs/dcache.c -index bb0c4d0038dbd..2ee8636016ee9 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c +diff -rupN linux.orig/fs/dcache.c linux/fs/dcache.c +--- linux.orig/fs/dcache.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/fs/dcache.c 2022-12-04 10:40:26.708034065 -0500 @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) @@ -4191,7 +27055,7 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) -@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, +@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct in wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); @@ -4201,10 +27065,9 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 wake_up_all(d_wait); } -diff --git a/include/linux/console.h b/include/linux/console.h -index 8c1686e2c2337..8a813cbaf9285 100644 ---- a/include/linux/console.h -+++ b/include/linux/console.h +diff -rupN linux.orig/include/linux/console.h linux/include/linux/console.h +--- linux.orig/include/linux/console.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/console.h 2022-12-04 10:40:26.712034055 -0500 @@ -16,6 +16,7 @@ #include @@ -4269,10 +27132,9 @@ index 8c1686e2c2337..8a813cbaf9285 100644 CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; -diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h -index 84a466b176cf4..df6d17bc30aa3 100644 ---- a/include/linux/entry-common.h -+++ b/include/linux/entry-common.h +diff -rupN linux.orig/include/linux/entry-common.h linux/include/linux/entry-common.h +--- linux.orig/include/linux/entry-common.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/entry-common.h 2022-12-04 10:40:26.712034055 -0500 @@ -57,9 +57,15 @@ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif @@ -4290,11 +27152,10 @@ index 84a466b176cf4..df6d17bc30aa3 100644 ARCH_EXIT_TO_USER_MODE_WORK) /** -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index a92bce40b04b3..bf82980f569df 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); +diff -rupN linux.orig/include/linux/interrupt.h linux/include/linux/interrupt.h +--- linux.orig/include/linux/interrupt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/interrupt.h 2022-12-04 10:40:26.712034055 -0500 +@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsig extern void raise_softirq_irqoff(unsigned int nr); extern void 
raise_softirq(unsigned int nr); @@ -4330,11 +27191,10 @@ index a92bce40b04b3..bf82980f569df 100644 DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) -diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 1cd4e36890fbf..844a8e30e6de5 100644 ---- a/include/linux/irqdesc.h -+++ b/include/linux/irqdesc.h -@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); +diff -rupN linux.orig/include/linux/irqdesc.h linux/include/linux/irqdesc.h +--- linux.orig/include/linux/irqdesc.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/irqdesc.h 2022-12-04 10:40:26.712034055 -0500 +@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); @@ -4342,10 +27202,9 @@ index 1cd4e36890fbf..844a8e30e6de5 100644 int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 1f1099dac3f05..1023f349af716 100644 ---- a/include/linux/lockdep.h -+++ b/include/linux/lockdep.h +diff -rupN linux.orig/include/linux/lockdep.h linux/include/linux/lockdep.h +--- linux.orig/include/linux/lockdep.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/lockdep.h 2022-12-04 10:40:26.712034055 -0500 @@ -435,7 +435,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; @@ -4354,11 +27213,10 @@ index 1f1099dac3f05..1023f349af716 100644 /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. -diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h -index 15ae78cd28536..b8728d11c9490 100644 ---- a/include/linux/mmdebug.h -+++ b/include/linux/mmdebug.h -@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); +diff -rupN linux.orig/include/linux/mmdebug.h linux/include/linux/mmdebug.h +--- linux.orig/include/linux/mmdebug.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/mmdebug.h 2022-12-04 10:40:26.712034055 -0500 +@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif @@ -4371,10 +27229,9 @@ index 15ae78cd28536..b8728d11c9490 100644 #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index 05d6f3facd5a5..5e6b840f5a9ac 100644 ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h +diff -rupN linux.orig/include/linux/netdevice.h linux/include/linux/netdevice.h +--- linux.orig/include/linux/netdevice.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/netdevice.h 2022-12-04 10:40:26.712034055 -0500 @@ -3156,7 +3156,11 @@ struct softnet_data { int defer_count; int defer_ipi_scheduled; @@ -4387,10 +27244,9 @@ index 05d6f3facd5a5..5e6b840f5a9ac 100644 }; static inline void input_queue_head_incr(struct softnet_data *sd) -diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index b4381f255a5ca..12f59cdaaedda 100644 ---- a/include/linux/preempt.h -+++ b/include/linux/preempt.h +diff -rupN linux.orig/include/linux/preempt.h linux/include/linux/preempt.h +--- linux.orig/include/linux/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/preempt.h 2022-12-04 10:40:26.712034055 -0500 @@ -196,6 +196,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -4537,10 +27393,9 @@ index b4381f255a5ca..12f59cdaaedda 100644 +} + #endif /* __LINUX_PREEMPT_H */ -diff --git a/include/linux/printk.h b/include/linux/printk.h -index cf7d666ab1f8e..f88ec15f83dcc 100644 ---- a/include/linux/printk.h -+++ b/include/linux/printk.h +diff -rupN linux.orig/include/linux/printk.h linux/include/linux/printk.h +--- linux.orig/include/linux/printk.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/printk.h 2022-12-04 10:40:26.712034055 -0500 @@ -169,7 +169,11 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit @@ -4553,7 +27408,7 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -@@ -221,11 +225,23 @@ static inline void printk_deferred_exit(void) +@@ -221,11 +225,23 @@ static inline void printk_deferred_exit( { } @@ -4577,10 +27432,9 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 static inline int printk_ratelimit(void) { return 0; -diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h -index 8f416c5e929ea..c0ef596f340b5 100644 ---- a/include/linux/rwlock.h -+++ b/include/linux/rwlock.h +diff -rupN linux.orig/include/linux/rwlock.h linux/include/linux/rwlock.h +--- linux.orig/include/linux/rwlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/rwlock.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H @@ -4590,11 +27444,10 @@ index 8f416c5e929ea..c0ef596f340b5 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 8d82d6d326701..e1623b3001c5b 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +diff -rupN linux.orig/include/linux/sched.h linux/include/linux/sched.h +--- linux.orig/include/linux/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/sched.h 2022-12-04 10:40:26.712034055 -0500 +@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched( return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -4638,10 +27491,9 @@ index 8d82d6d326701..e1623b3001c5b 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h -index 16e3d75a324c7..ee1f719a21678 100644 ---- a/include/linux/serial_8250.h -+++ b/include/linux/serial_8250.h +diff -rupN linux.orig/include/linux/serial_8250.h linux/include/linux/serial_8250.h +--- linux.orig/include/linux/serial_8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/serial_8250.h 2022-12-04 10:40:26.712034055 -0500 @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H @@ -4659,7 +27511,7 @@ index 16e3d75a324c7..ee1f719a21678 100644 struct uart_8250_dma *dma; const struct uart_8250_ops *ops; -@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_82 void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); @@ -4668,28 +27520,9 @@ index 16e3d75a324c7..ee1f719a21678 100644 int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h -index 5c0c5174155d0..1341f7d62da44 100644 ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -1,6 +1,7 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - #ifndef __LINUX_SPINLOCK_H - #define __LINUX_SPINLOCK_H -+#define __LINUX_INSIDE_SPINLOCK_H - - /* - * include/linux/spinlock.h - generic spinlock/rwlock declarations -@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, - - void free_bucket_spinlocks(spinlock_t *locks); - -+#undef __LINUX_INSIDE_SPINLOCK_H - #endif /* __LINUX_SPINLOCK_H */ -diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h -index 51fa0dab68c4d..89eb6f4c659c7 100644 ---- a/include/linux/spinlock_api_smp.h -+++ b/include/linux/spinlock_api_smp.h +diff -rupN linux.orig/include/linux/spinlock_api_smp.h linux/include/linux/spinlock_api_smp.h +--- linux.orig/include/linux/spinlock_api_smp.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_smp.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H @@ -4699,10 +27532,9 @@ index 51fa0dab68c4d..89eb6f4c659c7 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h -index b8ba00ccccdeb..819aeba1c87e6 100644 ---- a/include/linux/spinlock_api_up.h -+++ b/include/linux/spinlock_api_up.h +diff -rupN linux.orig/include/linux/spinlock_api_up.h linux/include/linux/spinlock_api_up.h +--- linux.orig/include/linux/spinlock_api_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_up.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H @@ -4712,10 +27544,26 @@ index b8ba00ccccdeb..819aeba1c87e6 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h -index 835aedaf68acd..61c49b16f69ab 100644 ---- a/include/linux/spinlock_rt.h -+++ b/include/linux/spinlock_rt.h +diff -rupN linux.orig/include/linux/spinlock.h 
linux/include/linux/spinlock.h +--- linux.orig/include/linux/spinlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock.h 2022-12-04 10:40:26.712034055 -0500 +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef __LINUX_SPINLOCK_H + #define __LINUX_SPINLOCK_H ++#define __LINUX_INSIDE_SPINLOCK_H + + /* + * include/linux/spinlock.h - generic spinlock/rwlock declarations +@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t + + void free_bucket_spinlocks(spinlock_t *locks); + ++#undef __LINUX_INSIDE_SPINLOCK_H + #endif /* __LINUX_SPINLOCK_H */ +diff -rupN linux.orig/include/linux/spinlock_rt.h linux/include/linux/spinlock_rt.h +--- linux.orig/include/linux/spinlock_rt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_rt.h 2022-12-04 10:40:26.712034055 -0500 @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H @@ -4725,10 +27573,9 @@ index 835aedaf68acd..61c49b16f69ab 100644 #error Do not include directly. Use spinlock.h #endif -diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h -index 16521074b6f7c..c87204247592f 100644 ---- a/include/linux/spinlock_up.h -+++ b/include/linux/spinlock_up.h +diff -rupN linux.orig/include/linux/spinlock_up.h linux/include/linux/spinlock_up.h +--- linux.orig/include/linux/spinlock_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_up.h 2022-12-04 10:40:26.716034044 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H @@ -4738,11 +27585,10 @@ index 16521074b6f7c..c87204247592f 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index 9f392ec76f2bb..779e0e96b9cb0 100644 ---- a/include/linux/thread_info.h -+++ b/include/linux/thread_info.h -@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti +diff -rupN linux.orig/include/linux/thread_info.h linux/include/linux/thread_info.h +--- linux.orig/include/linux/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/thread_info.h 2022-12-04 10:40:26.716034044 -0500 +@@ -177,7 +177,17 @@ static __always_inline unsigned long rea clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ @@ -4761,10 +27607,9 @@ index 9f392ec76f2bb..779e0e96b9cb0 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, -diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 20749bd9db718..224bf60d6563c 100644 ---- a/include/linux/trace_events.h -+++ b/include/linux/trace_events.h +diff -rupN linux.orig/include/linux/trace_events.h linux/include/linux/trace_events.h +--- linux.orig/include/linux/trace_events.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/trace_events.h 2022-12-04 10:40:26.716034044 -0500 @@ -70,6 +70,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; @@ -4773,7 +27618,7 @@ index 20749bd9db718..224bf60d6563c 100644 }; #define TRACE_EVENT_TYPE_MAX \ -@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, +@@ -159,9 +160,10 @@ static inline void tracing_generic_entry unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; @@ -4799,10 +27644,9 @@ index 20749bd9db718..224bf60d6563c 100644 TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; -diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h 
-index 6ad4e9032d538..ffe48e69b3f3a 100644 ---- a/include/linux/u64_stats_sync.h -+++ b/include/linux/u64_stats_sync.h +diff -rupN linux.orig/include/linux/u64_stats_sync.h linux/include/linux/u64_stats_sync.h +--- linux.orig/include/linux/u64_stats_sync.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/u64_stats_sync.h 2022-12-04 10:40:26.716034044 -0500 @@ -8,7 +8,7 @@ * * Key points : @@ -4843,7 +27687,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 seqcount_t seq; #endif }; -@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_sta local64_inc(&p->v); } @@ -4867,7 +27711,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 typedef struct { u64 v; -@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_sta { p->v++; } @@ -4944,25 +27788,50 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 -#else - return 0; -#endif -+} -+ + } + +-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_disable(); +-#endif +- return __u64_stats_fetch_begin(syncp); + return read_seqcount_retry(&syncp->seq, start); -+} + } +#endif /* !64 bit */ -+ + +-static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +- return read_seqcount_retry(&syncp->seq, start); +-#else +- return false; +-#endif + __u64_stats_update_begin(syncp); -+} -+ + } + +-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_enable(); +-#endif +- return __u64_stats_fetch_retry(syncp, start); + __u64_stats_update_end(syncp); -+} -+ + } + +-/* +- * In case irq handlers can update u64 counters, readers can use following helpers +- * - SMP 32bit arches use seqcount protection, irq safe. +- * - UP 32bit must disable irqs. +- * - 64bit have no problem atomically reading u64 values, irq safe. 
+- */ +-static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); @@ -4976,54 +27845,23 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); - } - - static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) ++} ++ ++static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_disable(); --#endif - return __u64_stats_fetch_begin(syncp); - } - --static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, -- unsigned int start) --{ --#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -- return read_seqcount_retry(&syncp->seq, start); --#else -- return false; --#endif --} -- - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) - { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_enable(); --#endif -- return __u64_stats_fetch_retry(syncp, start); --} -- --/* -- * In case irq handlers can update u64 counters, readers can use following helpers -- * - SMP 32bit arches use seqcount protection, irq safe. -- * - UP 32bit must disable irqs. -- * - 64bit have no problem atomically reading u64 values, irq safe. -- */ --static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) --{ -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif -- return __u64_stats_fetch_begin(syncp); --} -- + return __u64_stats_fetch_begin(syncp); + } + -static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) --{ ++static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, ++ unsigned int start) + { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) @@ -5032,10 +27870,9 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 return __u64_stats_fetch_retry(syncp, start); } -diff --git a/init/Kconfig b/init/Kconfig -index 532362fcfe31f..08ec5f25e6642 100644 ---- a/init/Kconfig -+++ b/init/Kconfig +diff -rupN linux.orig/init/Kconfig linux/init/Kconfig +--- linux.orig/init/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/init/Kconfig 2022-12-04 10:40:26.716034044 -0500 @@ -1574,6 +1574,10 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. 
@@ -5047,27 +27884,10 @@ index 532362fcfe31f..08ec5f25e6642 100644 config BUG bool "BUG() support" if EXPERT default y -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a8214..260c08efeb486 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -1,5 +1,11 @@ - # SPDX-License-Identifier: GPL-2.0-only - -+config HAVE_PREEMPT_LAZY -+ bool -+ -+config PREEMPT_LAZY -+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT -+ - config PREEMPT_NONE_BUILD - bool - -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 22e7a805c6723..b492e482b63a9 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -2107,11 +2107,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, +diff -rupN linux.orig/kernel/bpf/syscall.c linux/kernel/bpf/syscall.c +--- linux.orig/kernel/bpf/syscall.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/bpf/syscall.c 2022-12-04 10:40:26.716034044 -0500 +@@ -2118,11 +2118,11 @@ static void bpf_prog_get_stats(const str st = per_cpu_ptr(prog->stats, cpu); do { @@ -5081,11 +27901,5333 @@ index 22e7a805c6723..b492e482b63a9 100644 nsecs += tnsecs; cnt += tcnt; misses += tmisses; -diff --git a/kernel/entry/common.c b/kernel/entry/common.c -index 063068a9ea9b3..26b772720b227 100644 ---- a/kernel/entry/common.c -+++ b/kernel/entry/common.c -@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, +diff -rupN linux.orig/kernel/bpf/syscall.c.orig linux/kernel/bpf/syscall.c.orig +--- linux.orig/kernel/bpf/syscall.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/kernel/bpf/syscall.c.orig 2022-12-04 10:40:18.684054629 -0500 +@@ -0,0 +1,5319 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) ++#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) ++#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ ++ IS_FD_HASH(map)) ++ ++#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) ++ ++DEFINE_PER_CPU(int, bpf_prog_active); ++static DEFINE_IDR(prog_idr); ++static DEFINE_SPINLOCK(prog_idr_lock); ++static DEFINE_IDR(map_idr); ++static DEFINE_SPINLOCK(map_idr_lock); ++static DEFINE_IDR(link_idr); ++static DEFINE_SPINLOCK(link_idr_lock); ++ ++int sysctl_unprivileged_bpf_disabled __read_mostly = ++ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; ++ ++static const struct bpf_map_ops * const bpf_map_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) \ ++ [_id] = &_ops, ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++/* ++ * If we're handed a bigger struct than we know of, ensure all the unknown bits ++ * are 0 - i.e. new user-space does not rely on any kernel feature extensions ++ * we don't know about yet. 
bpf_map_poll(struct file *filp, struct poll_table_struct *pts) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_poll) ++ return map->ops->map_poll(map, filp, pts); ++ ++ return EPOLLERR; ++} ++ ++const struct file_operations bpf_map_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_map_show_fdinfo, ++#endif ++ .release = bpf_map_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++ .mmap = bpf_map_mmap, ++ .poll = bpf_map_poll, ++}; ++ ++int bpf_map_new_fd(struct bpf_map *map, int flags) ++{ ++ int ret; ++ ++ ret = security_bpf_map(map, OPEN_FMODE(flags)); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, ++ flags | O_CLOEXEC); ++} ++ ++int bpf_get_file_flag(int flags) ++{ ++ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) ++ return -EINVAL; ++ if (flags & BPF_F_RDONLY) ++ return O_RDONLY; ++ if (flags & BPF_F_WRONLY) ++ return O_WRONLY; ++ return O_RDWR; ++} ++ ++/* helper macro to check that unused fields 'union bpf_attr' are zero */ ++#define CHECK_ATTR(CMD) \ ++ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ ++ sizeof(attr->CMD##_LAST_FIELD), 0, \ ++ sizeof(*attr) - \ ++ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ ++ sizeof(attr->CMD##_LAST_FIELD)) != NULL ++ ++/* dst and src must have at least "size" number of bytes. ++ * Return strlen on success and < 0 on error. ++ */ ++int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) ++{ ++ const char *end = src + size; ++ const char *orig_src = src; ++ ++ memset(dst, 0, size); ++ /* Copy all isalnum(), '_' and '.' chars. */ ++ while (src < end && *src) { ++ if (!isalnum(*src) && ++ *src != '_' && *src != '.') ++ return -EINVAL; ++ *dst++ = *src++; ++ } ++ ++ /* No '\0' found in "size" number of bytes */ ++ if (src == end) ++ return -EINVAL; ++ ++ return src - orig_src; ++} ++ ++int map_check_no_btf(const struct bpf_map *map, ++ const struct btf *btf, ++ const struct btf_type *key_type, ++ const struct btf_type *value_type) ++{ ++ return -ENOTSUPP; ++} ++ ++static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) ++{ ++ const u32 a = *(const u32 *)_a; ++ const u32 b = *(const u32 *)_b; ++ ++ if (a < b) ++ return -1; ++ else if (a > b) ++ return 1; ++ return 0; ++} ++ ++static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) ++{ ++ struct bpf_map *map = (struct bpf_map *)priv; ++ u32 *off_base = map->off_arr->field_off; ++ u32 *a = _a, *b = _b; ++ u8 *sz_a, *sz_b; ++ ++ sz_a = map->off_arr->field_sz + (a - off_base); ++ sz_b = map->off_arr->field_sz + (b - off_base); ++ ++ swap(*a, *b); ++ swap(*sz_a, *sz_b); ++} ++ ++static int bpf_map_alloc_off_arr(struct bpf_map *map) ++{ ++ bool has_spin_lock = map_value_has_spin_lock(map); ++ bool has_timer = map_value_has_timer(map); ++ bool has_kptrs = map_value_has_kptrs(map); ++ struct bpf_map_off_arr *off_arr; ++ u32 i; ++ ++ if (!has_spin_lock && !has_timer && !has_kptrs) { ++ map->off_arr = NULL; ++ return 0; ++ } ++ ++ off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); ++ if (!off_arr) ++ return -ENOMEM; ++ map->off_arr = off_arr; ++ ++ off_arr->cnt = 0; ++ if (has_spin_lock) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->spin_lock_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); ++ off_arr->cnt++; ++ } ++ if (has_timer) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->timer_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_timer); ++ off_arr->cnt++; ++ } ++ if (has_kptrs) { ++ struct bpf_map_value_off 
*tab = map->kptr_off_tab; ++ u32 *off = &off_arr->field_off[off_arr->cnt]; ++ u8 *sz = &off_arr->field_sz[off_arr->cnt]; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ *off++ = tab->off[i].offset; ++ *sz++ = sizeof(u64); ++ } ++ off_arr->cnt += tab->nr_off; ++ } ++ ++ if (off_arr->cnt == 1) ++ return 0; ++ sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), ++ map_off_arr_cmp, map_off_arr_swap, map); ++ return 0; ++} ++ ++static int map_check_btf(struct bpf_map *map, const struct btf *btf, ++ u32 btf_key_id, u32 btf_value_id) ++{ ++ const struct btf_type *key_type, *value_type; ++ u32 key_size, value_size; ++ int ret = 0; ++ ++ /* Some maps allow key to be unspecified. */ ++ if (btf_key_id) { ++ key_type = btf_type_id_size(btf, &btf_key_id, &key_size); ++ if (!key_type || key_size != map->key_size) ++ return -EINVAL; ++ } else { ++ key_type = btf_type_by_id(btf, 0); ++ if (!map->ops->map_check_btf) ++ return -EINVAL; ++ } ++ ++ value_type = btf_type_id_size(btf, &btf_value_id, &value_size); ++ if (!value_type || value_size != map->value_size) ++ return -EINVAL; ++ ++ map->spin_lock_off = btf_find_spin_lock(btf, value_type); ++ ++ if (map_value_has_spin_lock(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY && ++ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && ++ map->map_type != BPF_MAP_TYPE_SK_STORAGE && ++ map->map_type != BPF_MAP_TYPE_INODE_STORAGE && ++ map->map_type != BPF_MAP_TYPE_TASK_STORAGE) ++ return -ENOTSUPP; ++ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > ++ map->value_size) { ++ WARN_ONCE(1, ++ "verifier bug spin_lock_off %d value_size %d\n", ++ map->spin_lock_off, map->value_size); ++ return -EFAULT; ++ } ++ } ++ ++ map->timer_off = btf_find_timer(btf, value_type); ++ if (map_value_has_timer(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) ++ return -EOPNOTSUPP; ++ } ++ ++ map->kptr_off_tab = btf_parse_kptrs(btf, value_type); ++ if (map_value_has_kptrs(map)) { ++ if (!bpf_capable()) { ++ ret = -EPERM; ++ goto free_map_tab; ++ } ++ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ++ ret = -EACCES; ++ goto free_map_tab; ++ } ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) { ++ ret = -EOPNOTSUPP; ++ goto free_map_tab; ++ } ++ } ++ ++ if (map->ops->map_check_btf) { ++ ret = map->ops->map_check_btf(map, btf, key_type, value_type); ++ if (ret < 0) ++ goto free_map_tab; ++ } ++ ++ return ret; ++free_map_tab: ++ bpf_map_free_kptr_off_tab(map); ++ return ret; ++} ++ ++#define BPF_MAP_CREATE_LAST_FIELD map_extra ++/* called via syscall */ ++static int map_create(union bpf_attr *attr) ++{ ++ int numa_node = bpf_map_attr_numa_node(attr); ++ struct bpf_map *map; ++ int f_flags; ++ int err; ++ ++ err = CHECK_ATTR(BPF_MAP_CREATE); ++ if (err) ++ return -EINVAL; ++ ++ if (attr->btf_vmlinux_value_type_id) { ++ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || ++ attr->btf_key_type_id || attr->btf_value_type_id) ++ return -EINVAL; ++ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { ++ return -EINVAL; ++ } ++ ++ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && ++ attr->map_extra != 0) ++ return -EINVAL; ++ ++ f_flags = bpf_get_file_flag(attr->map_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ if 
(numa_node != NUMA_NO_NODE && ++ ((unsigned int)numa_node >= nr_node_ids || ++ !node_online(numa_node))) ++ return -EINVAL; ++ ++ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ ++ map = find_and_alloc_map(attr); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ err = bpf_obj_name_cpy(map->name, attr->map_name, ++ sizeof(attr->map_name)); ++ if (err < 0) ++ goto free_map; ++ ++ atomic64_set(&map->refcnt, 1); ++ atomic64_set(&map->usercnt, 1); ++ mutex_init(&map->freeze_mutex); ++ spin_lock_init(&map->owner.lock); ++ ++ map->spin_lock_off = -EINVAL; ++ map->timer_off = -EINVAL; ++ if (attr->btf_key_type_id || attr->btf_value_type_id || ++ /* Even the map's value is a kernel's struct, ++ * the bpf_prog.o must have BTF to begin with ++ * to figure out the corresponding kernel's ++ * counter part. Thus, attr->btf_fd has ++ * to be valid also. ++ */ ++ attr->btf_vmlinux_value_type_id) { ++ struct btf *btf; ++ ++ btf = btf_get_by_fd(attr->btf_fd); ++ if (IS_ERR(btf)) { ++ err = PTR_ERR(btf); ++ goto free_map; ++ } ++ if (btf_is_kernel(btf)) { ++ btf_put(btf); ++ err = -EACCES; ++ goto free_map; ++ } ++ map->btf = btf; ++ ++ if (attr->btf_value_type_id) { ++ err = map_check_btf(map, btf, attr->btf_key_type_id, ++ attr->btf_value_type_id); ++ if (err) ++ goto free_map; ++ } ++ ++ map->btf_key_type_id = attr->btf_key_type_id; ++ map->btf_value_type_id = attr->btf_value_type_id; ++ map->btf_vmlinux_value_type_id = ++ attr->btf_vmlinux_value_type_id; ++ } ++ ++ err = bpf_map_alloc_off_arr(map); ++ if (err) ++ goto free_map; ++ ++ err = security_bpf_map_alloc(map); ++ if (err) ++ goto free_map_off_arr; ++ ++ err = bpf_map_alloc_id(map); ++ if (err) ++ goto free_map_sec; ++ ++ bpf_map_save_memcg(map); ++ ++ err = bpf_map_new_fd(map, f_flags); ++ if (err < 0) { ++ /* failed to allocate fd. ++ * bpf_map_put_with_uref() is needed because the above ++ * bpf_map_alloc_id() has published the map ++ * to the userspace and the userspace may ++ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. ++ */ ++ bpf_map_put_with_uref(map); ++ return err; ++ } ++ ++ return err; ++ ++free_map_sec: ++ security_bpf_map_free(map); ++free_map_off_arr: ++ kfree(map->off_arr); ++free_map: ++ btf_put(map->btf); ++ map->ops->map_free(map); ++ return err; ++} ++ ++/* if error is returned, fd is released. 
++ * On success caller should complete fd access with matching fdput() ++ */ ++struct bpf_map *__bpf_map_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_map_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_map_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc); ++ ++void bpf_map_inc_with_uref(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++ atomic64_inc(&map->usercnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); ++ ++struct bpf_map *bpf_map_get(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc(map); ++ fdput(f); ++ ++ return map; ++} ++EXPORT_SYMBOL(bpf_map_get); ++ ++struct bpf_map *bpf_map_get_with_uref(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc_with_uref(map); ++ fdput(f); ++ ++ return map; ++} ++ ++/* map_idr_lock should have been held */ ++static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ if (uref) ++ atomic64_inc(&map->usercnt); ++ ++ return map; ++} ++ ++struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) ++{ ++ spin_lock_bh(&map_idr_lock); ++ map = __bpf_map_inc_not_zero(map, false); ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); ++ ++int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) ++{ ++ return -ENOTSUPP; ++} ++ ++static void *__bpf_copy_key(void __user *ukey, u64 key_size) ++{ ++ if (key_size) ++ return vmemdup_user(ukey, key_size); ++ ++ if (ukey) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) ++{ ++ if (key_size) ++ return kvmemdup_bpfptr(ukey, key_size); ++ ++ if (!bpfptr_is_null(ukey)) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags ++ ++static int map_lookup_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ if (copy_from_user(value, uvalue, value_size)) ++ err = -EFAULT; ++ else ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ goto free_value; ++ } ++ ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ if (err) ++ goto free_value; ++ ++ 
err = -EFAULT; ++ if (copy_to_user(uvalue, value, value_size) != 0) ++ goto free_value; ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++ ++#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags ++ ++static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); ++ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = ___bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(value, uvalue, value_size) != 0) ++ goto free_value; ++ ++ err = bpf_map_update_value(map, f, key, value, attr->flags); ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_DELETE_ELEM_LAST_FIELD key ++ ++static int map_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ void *key; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ goto out; ++ } else if (IS_FD_PROG_ARRAY(map) || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ /* These maps require sleepable context */ ++ err = map->ops->map_delete_elem(map, key); ++ goto out; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++out: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key ++ ++static int map_get_next_key(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *unext_key = u64_to_user_ptr(attr->next_key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *next_key; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (ukey) { ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto 
err_put; ++ } ++ } else { ++ key = NULL; ++ } ++ ++ err = -ENOMEM; ++ next_key = kvmalloc(map->key_size, GFP_USER); ++ if (!next_key) ++ goto free_key; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_get_next_key(map, key, next_key); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, key, next_key); ++ rcu_read_unlock(); ++out: ++ if (err) ++ goto free_next_key; ++ ++ err = -EFAULT; ++ if (copy_to_user(unext_key, next_key, map->key_size) != 0) ++ goto free_next_key; ++ ++ err = 0; ++ ++free_next_key: ++ kvfree(next_key); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++int generic_map_delete_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 cp, max_count; ++ int err = 0; ++ void *key; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size)) ++ break; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ break; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ if (err) ++ break; ++ cond_resched(); ++ } ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(key); ++ ++ maybe_wait_bpf_programs(map); ++ return err; ++} ++ ++int generic_map_update_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *values = u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 value_size, cp, max_count; ++ int ufd = attr->batch.map_fd; ++ void *key, *value; ++ struct fd f; ++ int err = 0; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) { ++ kvfree(key); ++ return -ENOMEM; ++ } ++ ++ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size) || ++ copy_from_user(value, values + cp * value_size, value_size)) ++ break; ++ ++ err = bpf_map_update_value(map, f, key, value, ++ attr->batch.elem_flags); ++ ++ if (err) ++ break; ++ cond_resched(); ++ } ++ ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(value); ++ kvfree(key); ++ fdput(f); ++ return err; ++} ++ ++#define MAP_LOOKUP_RETRIES 3 ++ ++int generic_map_lookup_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); ++ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); ++ void __user *values 
= u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ void *buf, *buf_prevkey, *prev_key, *key, *value; ++ int err, retry = MAP_LOOKUP_RETRIES; ++ u32 value_size, cp, max_count; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) ++ return -EINVAL; ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ if (put_user(0, &uattr->batch.count)) ++ return -EFAULT; ++ ++ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!buf_prevkey) ++ return -ENOMEM; ++ ++ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); ++ if (!buf) { ++ kvfree(buf_prevkey); ++ return -ENOMEM; ++ } ++ ++ err = -EFAULT; ++ prev_key = NULL; ++ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) ++ goto free_buf; ++ key = buf; ++ value = key + map->key_size; ++ if (ubatch) ++ prev_key = buf_prevkey; ++ ++ for (cp = 0; cp < max_count;) { ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, prev_key, key); ++ rcu_read_unlock(); ++ if (err) ++ break; ++ err = bpf_map_copy_value(map, key, value, ++ attr->batch.elem_flags); ++ ++ if (err == -ENOENT) { ++ if (retry) { ++ retry--; ++ continue; ++ } ++ err = -EINTR; ++ break; ++ } ++ ++ if (err) ++ goto free_buf; ++ ++ if (copy_to_user(keys + cp * map->key_size, key, ++ map->key_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ if (copy_to_user(values + cp * value_size, value, value_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ ++ if (!prev_key) ++ prev_key = buf_prevkey; ++ ++ swap(prev_key, key); ++ retry = MAP_LOOKUP_RETRIES; ++ cp++; ++ cond_resched(); ++ } ++ ++ if (err == -EFAULT) ++ goto free_buf; ++ ++ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || ++ (cp && copy_to_user(uobatch, prev_key, map->key_size)))) ++ err = -EFAULT; ++ ++free_buf: ++ kvfree(buf_prevkey); ++ kvfree(buf); ++ return err; ++} ++ ++#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags ++ ++static int map_lookup_and_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || ++ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (attr->flags && ++ (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -ENOTSUPP; ++ if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK) { ++ err = map->ops->map_pop_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH || ++ 
map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ if (!bpf_map_is_dev_bound(map)) { ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ } ++ } ++ ++ if (err) ++ goto free_value; ++ ++ if (copy_to_user(uvalue, value, value_size) != 0) { ++ err = -EFAULT; ++ goto free_value; ++ } ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_FREEZE_LAST_FIELD map_fd ++ ++static int map_freeze(const union bpf_attr *attr) ++{ ++ int err = 0, ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_FREEZE)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) { ++ fdput(f); ++ return -ENOTSUPP; ++ } ++ ++ mutex_lock(&map->freeze_mutex); ++ if (bpf_map_write_active(map)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (READ_ONCE(map->frozen)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (!bpf_capable()) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ WRITE_ONCE(map->frozen, true); ++err_put: ++ mutex_unlock(&map->freeze_mutex); ++ fdput(f); ++ return err; ++} ++ ++static const struct bpf_prog_ops * const bpf_prog_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ ++ [_id] = & _name ## _prog_ops, ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) ++#include <linux/bpf_types.h> ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) ++{ ++ const struct bpf_prog_ops *ops; ++ ++ if (type >= ARRAY_SIZE(bpf_prog_types)) ++ return -EINVAL; ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ++ ops = bpf_prog_types[type]; ++ if (!ops) ++ return -EINVAL; ++ ++ if (!bpf_prog_is_dev_bound(prog->aux)) ++ prog->aux->ops = ops; ++ else ++ prog->aux->ops = &bpf_offload_prog_ops; ++ prog->type = type; ++ return 0; ++} ++ ++enum bpf_audit { ++ BPF_AUDIT_LOAD, ++ BPF_AUDIT_UNLOAD, ++ BPF_AUDIT_MAX, ++}; ++ ++static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { ++ [BPF_AUDIT_LOAD] = "LOAD", ++ [BPF_AUDIT_UNLOAD] = "UNLOAD", ++}; ++ ++static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) ++{ ++ struct audit_context *ctx = NULL; ++ struct audit_buffer *ab; ++ ++ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) ++ return; ++ if (audit_enabled == AUDIT_OFF) ++ return; ++ if (op == BPF_AUDIT_LOAD) ++ ctx = audit_context(); ++ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); ++ if (unlikely(!ab)) ++ return; ++ audit_log_format(ab, "prog-id=%u op=%s", ++ prog->aux->id, bpf_audit_str[op]); ++ audit_log_end(ab); ++} ++ ++static int bpf_prog_alloc_id(struct bpf_prog *prog) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&prog_idr_lock); ++ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ prog->aux->id = id; ++ spin_unlock_bh(&prog_idr_lock); ++ idr_preload_end(); ++ ++ /* id is in [1, INT_MAX) */ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ?
0 : id; ++} ++ ++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* cBPF to eBPF migrations are currently not in the idr store. ++ * Offloaded programs are removed from the store when their device ++ * disappears - even if someone grabs an fd to them they are unusable, ++ * simply waiting for refcnt to drop to be freed. ++ */ ++ if (!prog->aux->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&prog_idr_lock, flags); ++ else ++ __acquire(&prog_idr_lock); ++ ++ idr_remove(&prog_idr, prog->aux->id); ++ prog->aux->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&prog_idr_lock, flags); ++ else ++ __release(&prog_idr_lock); ++} ++ ++static void __bpf_prog_put_rcu(struct rcu_head *rcu) ++{ ++ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); ++ ++ kvfree(aux->func_info); ++ kfree(aux->func_info_aux); ++ free_uid(aux->user); ++ security_bpf_prog_free(aux); ++ bpf_prog_free(aux->prog); ++} ++ ++static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) ++{ ++ bpf_prog_kallsyms_del_all(prog); ++ btf_put(prog->aux->btf); ++ kvfree(prog->aux->jited_linfo); ++ kvfree(prog->aux->linfo); ++ kfree(prog->aux->kfunc_tab); ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ ++ if (deferred) { ++ if (prog->aux->sleepable) ++ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); ++ else ++ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); ++ } else { ++ __bpf_prog_put_rcu(&prog->aux->rcu); ++ } ++} ++ ++static void bpf_prog_put_deferred(struct work_struct *work) ++{ ++ struct bpf_prog_aux *aux; ++ struct bpf_prog *prog; ++ ++ aux = container_of(work, struct bpf_prog_aux, work); ++ prog = aux->prog; ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); ++ __bpf_prog_put_noref(prog, true); ++} ++ ++static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ struct bpf_prog_aux *aux = prog->aux; ++ ++ if (atomic64_dec_and_test(&aux->refcnt)) { ++ /* bpf_prog_free_id() must be called first */ ++ bpf_prog_free_id(prog, do_idr_lock); ++ ++ if (in_irq() || irqs_disabled()) { ++ INIT_WORK(&aux->work, bpf_prog_put_deferred); ++ schedule_work(&aux->work); ++ } else { ++ bpf_prog_put_deferred(&aux->work); ++ } ++ } ++} ++ ++void bpf_prog_put(struct bpf_prog *prog) ++{ ++ __bpf_prog_put(prog, true); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_put); ++ ++static int bpf_prog_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_prog *prog = filp->private_data; ++ ++ bpf_prog_put(prog); ++ return 0; ++} ++ ++struct bpf_prog_kstats { ++ u64 nsecs; ++ u64 cnt; ++ u64 misses; ++}; ++ ++void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) ++{ ++ struct bpf_prog_stats *stats; ++ unsigned int flags; ++ ++ stats = this_cpu_ptr(prog->stats); ++ flags = u64_stats_update_begin_irqsave(&stats->syncp); ++ u64_stats_inc(&stats->misses); ++ u64_stats_update_end_irqrestore(&stats->syncp, flags); ++} ++ ++static void bpf_prog_get_stats(const struct bpf_prog *prog, ++ struct bpf_prog_kstats *stats) ++{ ++ u64 nsecs = 0, cnt = 0, misses = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ const struct bpf_prog_stats *st; ++ unsigned int start; ++ u64 tnsecs, tcnt, tmisses; ++ ++ st = per_cpu_ptr(prog->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&st->syncp); ++ tnsecs = u64_stats_read(&st->nsecs); ++ tcnt = u64_stats_read(&st->cnt); ++ tmisses = u64_stats_read(&st->misses); ++ } while (u64_stats_fetch_retry_irq(&st->syncp, start)); 
++ nsecs += tnsecs; ++ cnt += tcnt; ++ misses += tmisses; ++ } ++ stats->nsecs = nsecs; ++ stats->cnt = cnt; ++ stats->misses = misses; ++} ++ ++#ifdef CONFIG_PROC_FS ++static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_prog *prog = filp->private_data; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ struct bpf_prog_kstats stats; ++ ++ bpf_prog_get_stats(prog, &stats); ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "prog_type:\t%u\n" ++ "prog_jited:\t%u\n" ++ "prog_tag:\t%s\n" ++ "memlock:\t%llu\n" ++ "prog_id:\t%u\n" ++ "run_time_ns:\t%llu\n" ++ "run_cnt:\t%llu\n" ++ "recursion_misses:\t%llu\n" ++ "verified_insns:\t%u\n", ++ prog->type, ++ prog->jited, ++ prog_tag, ++ prog->pages * 1ULL << PAGE_SHIFT, ++ prog->aux->id, ++ stats.nsecs, ++ stats.cnt, ++ stats.misses, ++ prog->aux->verified_insns); ++} ++#endif ++ ++const struct file_operations bpf_prog_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_prog_show_fdinfo, ++#endif ++ .release = bpf_prog_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++int bpf_prog_new_fd(struct bpf_prog *prog) ++{ ++ int ret; ++ ++ ret = security_bpf_prog(prog); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, ++ O_RDWR | O_CLOEXEC); ++} ++ ++static struct bpf_prog *____bpf_prog_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_prog_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_prog_add(struct bpf_prog *prog, int i) ++{ ++ atomic64_add(i, &prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_add); ++ ++void bpf_prog_sub(struct bpf_prog *prog, int i) ++{ ++ /* Only to be used for undoing previous bpf_prog_add() in some ++ * error path. We still know that another entity in our call ++ * path holds a reference to the program, thus atomic_sub() can ++ * be safely used in such cases! 
++ */ ++ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_sub); ++ ++void bpf_prog_inc(struct bpf_prog *prog) ++{ ++ atomic64_inc(&prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc); ++ ++/* prog_idr_lock should have been held */ ++struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); ++ ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ ++ return prog; ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); ++ ++bool bpf_prog_get_ok(struct bpf_prog *prog, ++ enum bpf_prog_type *attach_type, bool attach_drv) ++{ ++ /* not an attachment, just a refcount inc, always allow */ ++ if (!attach_type) ++ return true; ++ ++ if (prog->type != *attach_type) ++ return false; ++ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) ++ return false; ++ ++ return true; ++} ++ ++static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, ++ bool attach_drv) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_prog *prog; ++ ++ prog = ____bpf_prog_get(f); ++ if (IS_ERR(prog)) ++ return prog; ++ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { ++ prog = ERR_PTR(-EINVAL); ++ goto out; ++ } ++ ++ bpf_prog_inc(prog); ++out: ++ fdput(f); ++ return prog; ++} ++ ++struct bpf_prog *bpf_prog_get(u32 ufd) ++{ ++ return __bpf_prog_get(ufd, NULL, false); ++} ++ ++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, ++ bool attach_drv) ++{ ++ return __bpf_prog_get(ufd, &type, attach_drv); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); ++ ++/* Initially all BPF programs could be loaded w/o specifying ++ * expected_attach_type. Later for some of them specifying expected_attach_type ++ * at load time became required so that program could be validated properly. ++ * Programs of types that are allowed to be loaded both w/ and w/o (for ++ * backward compatibility) expected_attach_type, should have the default attach ++ * type assigned to expected_attach_type for the latter case, so that it can be ++ * validated later at attach time. ++ * ++ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if ++ * prog type requires it but has some attach types that have to be backward ++ * compatible. ++ */ ++static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) ++{ ++ switch (attr->prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't ++ * exist so checking for non-zero is the way to go here. 
++ */ ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_CGROUP_INET_SOCK_CREATE; ++ break; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_SK_REUSEPORT_SELECT; ++ break; ++ } ++} ++ ++static int ++bpf_prog_load_check_attach(enum bpf_prog_type prog_type, ++ enum bpf_attach_type expected_attach_type, ++ struct btf *attach_btf, u32 btf_id, ++ struct bpf_prog *dst_prog) ++{ ++ if (btf_id) { ++ if (btf_id > BTF_MAX_TYPE) ++ return -EINVAL; ++ ++ if (!attach_btf && !dst_prog) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: ++ case BPF_PROG_TYPE_EXT: ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (attach_btf && (!btf_id || dst_prog)) ++ return -EINVAL; ++ ++ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && ++ prog_type != BPF_PROG_TYPE_EXT) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_CGROUP_GETSOCKOPT: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ if (expected_attach_type == BPF_SK_LOOKUP) ++ return 0; ++ return -EINVAL; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ switch (expected_attach_type) { ++ case BPF_SK_REUSEPORT_SELECT: ++ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SYSCALL: ++ case BPF_PROG_TYPE_EXT: ++ if (expected_attach_type) ++ return -EINVAL; ++ fallthrough; ++ default: ++ return 0; ++ } ++} ++ ++static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_SCHED_CLS: ++ case BPF_PROG_TYPE_SCHED_ACT: ++ case BPF_PROG_TYPE_XDP: ++ case BPF_PROG_TYPE_LWT_IN: ++ case BPF_PROG_TYPE_LWT_OUT: ++ case BPF_PROG_TYPE_LWT_XMIT: ++ case BPF_PROG_TYPE_LWT_SEG6LOCAL: ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ /* always unpriv */ ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ ++ default: ++ return false; ++ } ++} ++ ++static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_KPROBE: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size ++ ++static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type type = attr->prog_type; ++ struct bpf_prog *prog, *dst_prog = NULL; ++ struct btf *attach_btf = NULL; ++ int err; ++ char license[128]; ++ bool is_gpl; ++ ++ if (CHECK_ATTR(BPF_PROG_LOAD)) ++ return -EINVAL; ++ ++ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | ++ BPF_F_ANY_ALIGNMENT | ++ BPF_F_TEST_STATE_FREQ | ++ BPF_F_SLEEPABLE | ++ BPF_F_TEST_RND_HI32 | ++ BPF_F_XDP_HAS_FRAGS)) ++ return -EINVAL; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && ++ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && ++ !bpf_capable()) ++ return -EPERM; ++ ++ /* copy eBPF program license from user space */ ++ if (strncpy_from_bpfptr(license, ++ make_bpfptr(attr->license, uattr.is_kernel), ++ sizeof(license) - 1) < 0) ++ return -EFAULT; ++ license[sizeof(license) - 1] = 0; ++ ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ is_gpl = license_is_gpl_compatible(license); ++ ++ if (attr->insn_cnt == 0 || ++ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) ++ return -E2BIG; ++ if (type != BPF_PROG_TYPE_SOCKET_FILTER && ++ type != BPF_PROG_TYPE_CGROUP_SKB && ++ !bpf_capable()) ++ return -EPERM; ++ ++ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ if (is_perfmon_prog_type(type) && !perfmon_capable()) ++ return -EPERM; ++ ++ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog ++ * or btf, we need to check which one it is ++ */ ++ if (attr->attach_prog_fd) { ++ dst_prog = bpf_prog_get(attr->attach_prog_fd); ++ if (IS_ERR(dst_prog)) { ++ dst_prog = NULL; ++ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); ++ if (IS_ERR(attach_btf)) ++ return -EINVAL; ++ if (!btf_is_kernel(attach_btf)) { ++ /* attaching through specifying bpf_prog's BTF ++ * objects directly might be supported eventually ++ */ ++ btf_put(attach_btf); ++ return -ENOTSUPP; ++ } ++ } ++ } else if (attr->attach_btf_id) { ++ /* fall back to vmlinux BTF, if BTF type ID is specified */ ++ attach_btf = bpf_get_btf_vmlinux(); ++ if (IS_ERR(attach_btf)) ++ return PTR_ERR(attach_btf); ++ if (!attach_btf) ++ return -EINVAL; ++ btf_get(attach_btf); ++ } ++ ++ bpf_prog_load_fixup_attach_type(attr); ++ if (bpf_prog_load_check_attach(type, attr->expected_attach_type, ++ attach_btf, attr->attach_btf_id, ++ dst_prog)) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -EINVAL; ++ } ++ ++ /* plain bpf_prog allocation */ ++ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); ++ if (!prog) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -ENOMEM; ++ } ++ ++ prog->expected_attach_type = attr->expected_attach_type; ++ prog->aux->attach_btf = attach_btf; ++ 
prog->aux->attach_btf_id = attr->attach_btf_id; ++ prog->aux->dst_prog = dst_prog; ++ prog->aux->offload_requested = !!attr->prog_ifindex; ++ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; ++ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; ++ ++ err = security_bpf_prog_alloc(prog->aux); ++ if (err) ++ goto free_prog; ++ ++ prog->aux->user = get_current_user(); ++ prog->len = attr->insn_cnt; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(prog->insns, ++ make_bpfptr(attr->insns, uattr.is_kernel), ++ bpf_prog_insn_size(prog)) != 0) ++ goto free_prog_sec; ++ ++ prog->orig_prog = NULL; ++ prog->jited = 0; ++ ++ atomic64_set(&prog->aux->refcnt, 1); ++ prog->gpl_compatible = is_gpl ? 1 : 0; ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_init(prog, attr); ++ if (err) ++ goto free_prog_sec; ++ } ++ ++ /* find program type: socket_filter vs tracing_filter */ ++ err = find_prog_type(type, prog); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ prog->aux->load_time = ktime_get_boottime_ns(); ++ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, ++ sizeof(attr->prog_name)); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ /* run eBPF verifier */ ++ err = bpf_check(&prog, attr, uattr); ++ if (err < 0) ++ goto free_used_maps; ++ ++ prog = bpf_prog_select_runtime(prog, &err); ++ if (err < 0) ++ goto free_used_maps; ++ ++ err = bpf_prog_alloc_id(prog); ++ if (err) ++ goto free_used_maps; ++ ++ /* Upon success of bpf_prog_alloc_id(), the BPF prog is ++ * effectively publicly exposed. However, retrieving via ++ * bpf_prog_get_fd_by_id() will take another reference, ++ * therefore it cannot be gone underneath us. ++ * ++ * Only for the time /after/ successful bpf_prog_new_fd() ++ * and before returning to userspace, we might just hold ++ * one reference and any parallel close on that fd could ++ * rip everything out. Hence, below notifications must ++ * happen before bpf_prog_new_fd(). ++ * ++ * Also, any failure handling from this point onwards must ++ * be using bpf_prog_put() given the program is exposed. ++ */ ++ bpf_prog_kallsyms_add(prog); ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_LOAD); ++ ++ err = bpf_prog_new_fd(prog); ++ if (err < 0) ++ bpf_prog_put(prog); ++ return err; ++ ++free_used_maps: ++ /* In case we have subprogs, we need to wait for a grace ++ * period before we can tear down JIT memory since symbols ++ * are already exposed under kallsyms. 
++ */ ++ __bpf_prog_put_noref(prog, prog->aux->func_cnt); ++ return err; ++free_prog_sec: ++ free_uid(prog->aux->user); ++ security_bpf_prog_free(prog->aux); ++free_prog: ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ bpf_prog_free(prog); ++ return err; ++} ++ ++#define BPF_OBJ_LAST_FIELD file_flags ++ ++static int bpf_obj_pin(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) ++ return -EINVAL; ++ ++ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); ++} ++ ++static int bpf_obj_get(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || ++ attr->file_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), ++ attr->file_flags); ++} ++ ++void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, ++ const struct bpf_link_ops *ops, struct bpf_prog *prog) ++{ ++ atomic64_set(&link->refcnt, 1); ++ link->type = type; ++ link->id = 0; ++ link->ops = ops; ++ link->prog = prog; ++} ++ ++static void bpf_link_free_id(int id) ++{ ++ if (!id) ++ return; ++ ++ spin_lock_bh(&link_idr_lock); ++ idr_remove(&link_idr, id); ++ spin_unlock_bh(&link_idr_lock); ++} ++ ++/* Clean up bpf_link and corresponding anon_inode file and FD. After ++ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred ++ * anon_inode's release() call. This helper marks bpf_link as ++ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt ++ * is not decremented, it's the responsibility of a calling code that failed ++ * to complete bpf_link initialization. ++ */ ++void bpf_link_cleanup(struct bpf_link_primer *primer) ++{ ++ primer->link->prog = NULL; ++ bpf_link_free_id(primer->id); ++ fput(primer->file); ++ put_unused_fd(primer->fd); ++} ++ ++void bpf_link_inc(struct bpf_link *link) ++{ ++ atomic64_inc(&link->refcnt); ++} ++ ++/* bpf_link_free is guaranteed to be called from process context */ ++static void bpf_link_free(struct bpf_link *link) ++{ ++ bpf_link_free_id(link->id); ++ if (link->prog) { ++ /* detach BPF program, clean up used resources */ ++ link->ops->release(link); ++ bpf_prog_put(link->prog); ++ } ++ /* free bpf_link and its containing memory */ ++ link->ops->dealloc(link); ++} ++ ++static void bpf_link_put_deferred(struct work_struct *work) ++{ ++ struct bpf_link *link = container_of(work, struct bpf_link, work); ++ ++ bpf_link_free(link); ++} ++ ++/* bpf_link_put can be called from atomic context, but ensures that resources ++ * are freed from process context ++ */ ++void bpf_link_put(struct bpf_link *link) ++{ ++ if (!atomic64_dec_and_test(&link->refcnt)) ++ return; ++ ++ if (in_atomic()) { ++ INIT_WORK(&link->work, bpf_link_put_deferred); ++ schedule_work(&link->work); ++ } else { ++ bpf_link_free(link); ++ } ++} ++EXPORT_SYMBOL(bpf_link_put); ++ ++static int bpf_link_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_link *link = filp->private_data; ++ ++ bpf_link_put(link); ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, ++static const char *bpf_link_type_strs[] = { ++ [BPF_LINK_TYPE_UNSPEC] = "", ++#include <linux/bpf_types.h> ++}; ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++ ++static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_link *link = filp->private_data; ++ const struct bpf_prog *prog =
link->prog; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "link_type:\t%s\n" ++ "link_id:\t%u\n" ++ "prog_tag:\t%s\n" ++ "prog_id:\t%u\n", ++ bpf_link_type_strs[link->type], ++ link->id, ++ prog_tag, ++ prog->aux->id); ++ if (link->ops->show_fdinfo) ++ link->ops->show_fdinfo(link, m); ++} ++#endif ++ ++static const struct file_operations bpf_link_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_link_show_fdinfo, ++#endif ++ .release = bpf_link_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++static int bpf_link_alloc_id(struct bpf_link *link) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&link_idr_lock); ++ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); ++ spin_unlock_bh(&link_idr_lock); ++ idr_preload_end(); ++ ++ return id; ++} ++ ++/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, ++ * reserving unused FD and allocating ID from link_idr. This is to be paired ++ * with bpf_link_settle() to install FD and ID and expose bpf_link to ++ * user-space, if bpf_link is successfully attached. If not, bpf_link and ++ * pre-allocated resources are to be freed with bpf_cleanup() call. All the ++ * transient state is passed around in struct bpf_link_primer. ++ * This is preferred way to create and initialize bpf_link, especially when ++ * there are complicated and expensive operations in between creating bpf_link ++ * itself and attaching it to BPF hook. By using bpf_link_prime() and ++ * bpf_link_settle() kernel code using bpf_link doesn't have to perform ++ * expensive (and potentially failing) roll back operations in a rare case ++ * that file, FD, or ID can't be allocated. ++ */ ++int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) ++{ ++ struct file *file; ++ int fd, id; ++ ++ fd = get_unused_fd_flags(O_CLOEXEC); ++ if (fd < 0) ++ return fd; ++ ++ ++ id = bpf_link_alloc_id(link); ++ if (id < 0) { ++ put_unused_fd(fd); ++ return id; ++ } ++ ++ file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); ++ if (IS_ERR(file)) { ++ bpf_link_free_id(id); ++ put_unused_fd(fd); ++ return PTR_ERR(file); ++ } ++ ++ primer->link = link; ++ primer->file = file; ++ primer->fd = fd; ++ primer->id = id; ++ return 0; ++} ++ ++int bpf_link_settle(struct bpf_link_primer *primer) ++{ ++ /* make bpf_link fetchable by ID */ ++ spin_lock_bh(&link_idr_lock); ++ primer->link->id = primer->id; ++ spin_unlock_bh(&link_idr_lock); ++ /* make bpf_link fetchable by FD */ ++ fd_install(primer->fd, primer->file); ++ /* pass through installed FD */ ++ return primer->fd; ++} ++ ++int bpf_link_new_fd(struct bpf_link *link) ++{ ++ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); ++} ++ ++struct bpf_link *bpf_link_get_from_fd(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_link *link; ++ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_link_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ link = f.file->private_data; ++ bpf_link_inc(link); ++ fdput(f); ++ ++ return link; ++} ++EXPORT_SYMBOL(bpf_link_get_from_fd); ++ ++static void bpf_tracing_link_release(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, ++ tr_link->trampoline)); ++ ++ bpf_trampoline_put(tr_link->trampoline); ++ ++ /* tgt_prog is NULL if target is a kernel function */ 
++ if (tr_link->tgt_prog) ++ bpf_prog_put(tr_link->tgt_prog); ++} ++ ++static void bpf_tracing_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ kfree(tr_link); ++} ++ ++static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ seq_printf(seq, ++ "attach_type:\t%d\n", ++ tr_link->attach_type); ++} ++ ++static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ info->tracing.attach_type = tr_link->attach_type; ++ bpf_trampoline_unpack_key(tr_link->trampoline->key, ++ &info->tracing.target_obj_id, ++ &info->tracing.target_btf_id); ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_tracing_link_lops = { ++ .release = bpf_tracing_link_release, ++ .dealloc = bpf_tracing_link_dealloc, ++ .show_fdinfo = bpf_tracing_link_show_fdinfo, ++ .fill_link_info = bpf_tracing_link_fill_link_info, ++}; ++ ++static int bpf_tracing_prog_attach(struct bpf_prog *prog, ++ int tgt_prog_fd, ++ u32 btf_id, ++ u64 bpf_cookie) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_prog *tgt_prog = NULL; ++ struct bpf_trampoline *tr = NULL; ++ struct bpf_tracing_link *link; ++ u64 key = 0; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ if (prog->expected_attach_type != BPF_TRACE_FENTRY && ++ prog->expected_attach_type != BPF_TRACE_FEXIT && ++ prog->expected_attach_type != BPF_MODIFY_RETURN) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_EXT: ++ if (prog->expected_attach_type != 0) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_LSM: ++ if (prog->expected_attach_type != BPF_LSM_MAC) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ default: ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (!!tgt_prog_fd != !!btf_id) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (tgt_prog_fd) { ++ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ ++ if (prog->type != BPF_PROG_TYPE_EXT) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ tgt_prog = bpf_prog_get(tgt_prog_fd); ++ if (IS_ERR(tgt_prog)) { ++ err = PTR_ERR(tgt_prog); ++ tgt_prog = NULL; ++ goto out_put_prog; ++ } ++ ++ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_prog; ++ } ++ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, ++ &bpf_tracing_link_lops, prog); ++ link->attach_type = prog->expected_attach_type; ++ link->link.cookie = bpf_cookie; ++ ++ mutex_lock(&prog->aux->dst_mutex); ++ ++ /* There are a few possible cases here: ++ * ++ * - if prog->aux->dst_trampoline is set, the program was just loaded ++ * and not yet attached to anything, so we can use the values stored ++ * in prog->aux ++ * ++ * - if prog->aux->dst_trampoline is NULL, the program has already been ++ * attached to a target and its initial target was cleared (below) ++ * ++ * - if tgt_prog != NULL, the caller specified tgt_prog_fd + ++ * target_btf_id using the link_create API. 
++ * ++ * - if tgt_prog == NULL when this function was called using the old ++ * raw_tracepoint_open API, and we need a target from prog->aux ++ * ++ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program ++ * was detached and is going for re-attachment. ++ */ ++ if (!prog->aux->dst_trampoline && !tgt_prog) { ++ /* ++ * Allow re-attach for TRACING and LSM programs. If it's ++ * currently linked, bpf_trampoline_link_prog will fail. ++ * EXT programs need to specify tgt_prog_fd, so they ++ * re-attach in separate code path. ++ */ ++ if (prog->type != BPF_PROG_TYPE_TRACING && ++ prog->type != BPF_PROG_TYPE_LSM) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ btf_id = prog->aux->attach_btf_id; ++ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); ++ } ++ ++ if (!prog->aux->dst_trampoline || ++ (key && key != prog->aux->dst_trampoline->key)) { ++ /* If there is no saved target, or the specified target is ++ * different from the destination specified at load time, we ++ * need a new trampoline and a check for compatibility ++ */ ++ struct bpf_attach_target_info tgt_info = {}; ++ ++ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, ++ &tgt_info); ++ if (err) ++ goto out_unlock; ++ ++ tr = bpf_trampoline_get(key, &tgt_info); ++ if (!tr) { ++ err = -ENOMEM; ++ goto out_unlock; ++ } ++ } else { ++ /* The caller didn't specify a target, or the target was the ++ * same as the destination supplied during program load. This ++ * means we can reuse the trampoline and reference from program ++ * load time, and there is no need to allocate a new one. This ++ * can only happen once for any program, as the saved values in ++ * prog->aux are cleared below. ++ */ ++ tr = prog->aux->dst_trampoline; ++ tgt_prog = prog->aux->dst_prog; ++ } ++ ++ err = bpf_link_prime(&link->link.link, &link_primer); ++ if (err) ++ goto out_unlock; ++ ++ err = bpf_trampoline_link_prog(&link->link, tr); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ link = NULL; ++ goto out_unlock; ++ } ++ ++ link->tgt_prog = tgt_prog; ++ link->trampoline = tr; ++ ++ /* Always clear the trampoline and target prog from prog->aux to make ++ * sure the original attach destination is not kept alive after a ++ * program is (re-)attached to another target. 
++ */ ++ if (prog->aux->dst_prog && ++ (tgt_prog_fd || tr != prog->aux->dst_trampoline)) ++ /* got extra prog ref from syscall, or attaching to different prog */ ++ bpf_prog_put(prog->aux->dst_prog); ++ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) ++ /* we allocated a new trampoline, so free the old one */ ++ bpf_trampoline_put(prog->aux->dst_trampoline); ++ ++ prog->aux->dst_prog = NULL; ++ prog->aux->dst_trampoline = NULL; ++ mutex_unlock(&prog->aux->dst_mutex); ++ ++ return bpf_link_settle(&link_primer); ++out_unlock: ++ if (tr && tr != prog->aux->dst_trampoline) ++ bpf_trampoline_put(tr); ++ mutex_unlock(&prog->aux->dst_mutex); ++ kfree(link); ++out_put_prog: ++ if (tgt_prog_fd && tgt_prog) ++ bpf_prog_put(tgt_prog); ++ return err; ++} ++ ++struct bpf_raw_tp_link { ++ struct bpf_link link; ++ struct bpf_raw_event_map *btp; ++}; ++ ++static void bpf_raw_tp_link_release(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); ++ bpf_put_raw_tracepoint(raw_tp->btp); ++} ++ ++static void bpf_raw_tp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ kfree(raw_tp); ++} ++ ++static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ seq_printf(seq, ++ "tp_name:\t%s\n", ++ raw_tp_link->btp->tp->name); ++} ++ ++static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); ++ const char *tp_name = raw_tp_link->btp->tp->name; ++ u32 ulen = info->raw_tracepoint.tp_name_len; ++ size_t tp_len = strlen(tp_name); ++ ++ if (!ulen ^ !ubuf) ++ return -EINVAL; ++ ++ info->raw_tracepoint.tp_name_len = tp_len + 1; ++ ++ if (!ubuf) ++ return 0; ++ ++ if (ulen >= tp_len + 1) { ++ if (copy_to_user(ubuf, tp_name, tp_len + 1)) ++ return -EFAULT; ++ } else { ++ char zero = '\0'; ++ ++ if (copy_to_user(ubuf, tp_name, ulen - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + ulen - 1)) ++ return -EFAULT; ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_raw_tp_link_lops = { ++ .release = bpf_raw_tp_link_release, ++ .dealloc = bpf_raw_tp_link_dealloc, ++ .show_fdinfo = bpf_raw_tp_link_show_fdinfo, ++ .fill_link_info = bpf_raw_tp_link_fill_link_info, ++}; ++ ++#ifdef CONFIG_PERF_EVENTS ++struct bpf_perf_link { ++ struct bpf_link link; ++ struct file *perf_file; ++}; ++ ++static void bpf_perf_link_release(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ struct perf_event *event = perf_link->perf_file->private_data; ++ ++ perf_event_free_bpf_prog(event); ++ fput(perf_link->perf_file); ++} ++ ++static void bpf_perf_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ ++ kfree(perf_link); ++} ++ ++static const struct bpf_link_ops bpf_perf_link_lops = { ++ .release = bpf_perf_link_release, ++ .dealloc = bpf_perf_link_dealloc, ++}; ++ ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_perf_link 
*link; ++ struct perf_event *event; ++ struct file *perf_file; ++ int err; ++ ++ if (attr->link_create.flags) ++ return -EINVAL; ++ ++ perf_file = perf_event_get(attr->link_create.target_fd); ++ if (IS_ERR(perf_file)) ++ return PTR_ERR(perf_file); ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_file; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); ++ link->perf_file = perf_file; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_file; ++ } ++ ++ event = perf_file->private_data; ++ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_file; ++ } ++ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ ++ bpf_prog_inc(prog); ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_file: ++ fput(perf_file); ++ return err; ++} ++#else ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* CONFIG_PERF_EVENTS */ ++ ++static int bpf_raw_tp_link_attach(struct bpf_prog *prog, ++ const char __user *user_tp_name) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_raw_tp_link *link; ++ struct bpf_raw_event_map *btp; ++ const char *tp_name; ++ char buf[128]; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_EXT: ++ case BPF_PROG_TYPE_LSM: ++ if (user_tp_name) ++ /* The attach point for this category of programs ++ * should be specified via btf_id during program load. ++ */ ++ return -EINVAL; ++ if (prog->type == BPF_PROG_TYPE_TRACING && ++ prog->expected_attach_type == BPF_TRACE_RAW_TP) { ++ tp_name = prog->aux->attach_func_name; ++ break; ++ } ++ return bpf_tracing_prog_attach(prog, 0, 0, 0); ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) ++ return -EFAULT; ++ buf[sizeof(buf) - 1] = 0; ++ tp_name = buf; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ btp = bpf_get_raw_tracepoint(tp_name); ++ if (!btp) ++ return -ENOENT; ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_btp; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, ++ &bpf_raw_tp_link_lops, prog); ++ link->btp = btp; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_btp; ++ } ++ ++ err = bpf_probe_register(link->btp, prog); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_btp; ++ } ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_btp: ++ bpf_put_raw_tracepoint(btp); ++ return err; ++} ++ ++#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd ++ ++static int bpf_raw_tracepoint_open(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ return fd; ++} ++ ++static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ++ enum bpf_attach_type attach_type) ++{ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case 
BPF_PROG_TYPE_SK_LOOKUP: ++ return attach_type == prog->expected_attach_type ? 0 : -EINVAL; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ if (!capable(CAP_NET_ADMIN)) ++ /* cg-skb progs can be loaded by unpriv user. ++ * check permissions at attach time. ++ */ ++ return -EPERM; ++ return prog->enforce_expected_attach_type && ++ prog->expected_attach_type != attach_type ? ++ -EINVAL : 0; ++ default: ++ return 0; ++ } ++} ++ ++static enum bpf_prog_type ++attach_type_to_prog_type(enum bpf_attach_type attach_type) ++{ ++ switch (attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return BPF_PROG_TYPE_CGROUP_SKB; ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return BPF_PROG_TYPE_CGROUP_SOCK; ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; ++ case BPF_CGROUP_SOCK_OPS: ++ return BPF_PROG_TYPE_SOCK_OPS; ++ case BPF_CGROUP_DEVICE: ++ return BPF_PROG_TYPE_CGROUP_DEVICE; ++ case BPF_SK_MSG_VERDICT: ++ return BPF_PROG_TYPE_SK_MSG; ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return BPF_PROG_TYPE_SK_SKB; ++ case BPF_LIRC_MODE2: ++ return BPF_PROG_TYPE_LIRC_MODE2; ++ case BPF_FLOW_DISSECTOR: ++ return BPF_PROG_TYPE_FLOW_DISSECTOR; ++ case BPF_CGROUP_SYSCTL: ++ return BPF_PROG_TYPE_CGROUP_SYSCTL; ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ return BPF_PROG_TYPE_CGROUP_SOCKOPT; ++ case BPF_TRACE_ITER: ++ case BPF_TRACE_RAW_TP: ++ case BPF_TRACE_FENTRY: ++ case BPF_TRACE_FEXIT: ++ case BPF_MODIFY_RETURN: ++ return BPF_PROG_TYPE_TRACING; ++ case BPF_LSM_MAC: ++ return BPF_PROG_TYPE_LSM; ++ case BPF_SK_LOOKUP: ++ return BPF_PROG_TYPE_SK_LOOKUP; ++ case BPF_XDP: ++ return BPF_PROG_TYPE_XDP; ++ case BPF_LSM_CGROUP: ++ return BPF_PROG_TYPE_LSM; ++ default: ++ return BPF_PROG_TYPE_UNSPEC; ++ } ++} ++ ++#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd ++ ++#define BPF_F_ATTACH_MASK \ ++ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) ++ ++static int bpf_prog_attach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_PROG_ATTACH)) ++ return -EINVAL; ++ ++ if (attr->attach_flags & ~BPF_F_ATTACH_MASK) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ ret = sock_map_get_from_fd(attr, prog); ++ break; ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ ret = lirc_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ ret = netns_bpf_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case 
BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ if (ptype == BPF_PROG_TYPE_LSM && ++ prog->expected_attach_type != BPF_LSM_CGROUP) ++ return -EINVAL; ++ ++ ret = cgroup_bpf_prog_attach(attr, ptype, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_PROG_DETACH_LAST_FIELD attach_type ++ ++static int bpf_prog_detach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ ++ if (CHECK_ATTR(BPF_PROG_DETACH)) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_SK_SKB: ++ return sock_map_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ return lirc_prog_detach(attr); ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ return netns_bpf_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ return cgroup_bpf_prog_detach(attr, ptype); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags ++ ++static int bpf_prog_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ if (CHECK_ATTR(BPF_PROG_QUERY)) ++ return -EINVAL; ++ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) ++ return -EINVAL; ++ ++ switch (attr->query.attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ case BPF_CGROUP_SOCK_OPS: ++ case BPF_CGROUP_DEVICE: ++ case BPF_CGROUP_SYSCTL: ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_LSM_CGROUP: ++ return cgroup_bpf_prog_query(attr, uattr); ++ case BPF_LIRC_MODE2: ++ return lirc_prog_query(attr, uattr); ++ case BPF_FLOW_DISSECTOR: ++ case BPF_SK_LOOKUP: ++ return netns_bpf_prog_query(attr, uattr); ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_MSG_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return sock_map_bpf_prog_query(attr, uattr); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size ++ ++static int bpf_prog_test_run(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog *prog; ++ int ret = -ENOTSUPP; ++ ++ if (CHECK_ATTR(BPF_PROG_TEST_RUN)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_in && !attr->test.ctx_in) || ++ (!attr->test.ctx_size_in && attr->test.ctx_in)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_out && !attr->test.ctx_out) || ++ (!attr->test.ctx_size_out && attr->test.ctx_out)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->test.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (prog->aux->ops->test_run) ++ 
ret = prog->aux->ops->test_run(prog, attr, uattr); ++ ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id ++ ++static int bpf_obj_get_next_id(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ struct idr *idr, ++ spinlock_t *lock) ++{ ++ u32 next_id = attr->start_id; ++ int err = 0; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ next_id++; ++ spin_lock_bh(lock); ++ if (!idr_get_next(idr, &next_id)) ++ err = -ENOENT; ++ spin_unlock_bh(lock); ++ ++ if (!err) ++ err = put_user(next_id, &uattr->next_id); ++ ++ return err; ++} ++ ++struct bpf_map *bpf_map_get_curr_or_next(u32 *id) ++{ ++ struct bpf_map *map; ++ ++ spin_lock_bh(&map_idr_lock); ++again: ++ map = idr_get_next(&map_idr, id); ++ if (map) { ++ map = __bpf_map_inc_not_zero(map, false); ++ if (IS_ERR(map)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++ ++struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) ++{ ++ struct bpf_prog *prog; ++ ++ spin_lock_bh(&prog_idr_lock); ++again: ++ prog = idr_get_next(&prog_idr, id); ++ if (prog) { ++ prog = bpf_prog_inc_not_zero(prog); ++ if (IS_ERR(prog)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&prog_idr_lock); ++ ++ return prog; ++} ++ ++#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id ++ ++struct bpf_prog *bpf_prog_by_id(u32 id) ++{ ++ struct bpf_prog *prog; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&prog_idr_lock); ++ prog = idr_find(&prog_idr, id); ++ if (prog) ++ prog = bpf_prog_inc_not_zero(prog); ++ else ++ prog = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&prog_idr_lock); ++ return prog; ++} ++ ++static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ u32 id = attr->prog_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ prog = bpf_prog_by_id(id); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_prog_new_fd(prog); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ ++ return fd; ++} ++ ++#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags ++ ++static int bpf_map_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_map *map; ++ u32 id = attr->map_id; ++ int f_flags; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || ++ attr->open_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ f_flags = bpf_get_file_flag(attr->open_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ spin_lock_bh(&map_idr_lock); ++ map = idr_find(&map_idr, id); ++ if (map) ++ map = __bpf_map_inc_not_zero(map, true); ++ else ++ map = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&map_idr_lock); ++ ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ fd = bpf_map_new_fd(map, f_flags); ++ if (fd < 0) ++ bpf_map_put_with_uref(map); ++ ++ return fd; ++} ++ ++static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, ++ unsigned long addr, u32 *off, ++ u32 *type) ++{ ++ const struct bpf_map *map; ++ int i; ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { ++ map = prog->aux->used_maps[i]; ++ if (map == (void *)addr) { ++ *type = BPF_PSEUDO_MAP_FD; ++ goto out; ++ } ++ if (!map->ops->map_direct_value_meta) ++ continue; ++ if (!map->ops->map_direct_value_meta(map, addr, off)) { ++ *type = BPF_PSEUDO_MAP_VALUE; ++ goto out; ++ } ++ } ++ map = NULL; ++ 
++out: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return map; ++} ++ ++static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, ++ const struct cred *f_cred) ++{ ++ const struct bpf_map *map; ++ struct bpf_insn *insns; ++ u32 off, type; ++ u64 imm; ++ u8 code; ++ int i; ++ ++ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), ++ GFP_USER); ++ if (!insns) ++ return insns; ++ ++ for (i = 0; i < prog->len; i++) { ++ code = insns[i].code; ++ ++ if (code == (BPF_JMP | BPF_TAIL_CALL)) { ++ insns[i].code = BPF_JMP | BPF_CALL; ++ insns[i].imm = BPF_FUNC_tail_call; ++ /* fall-through */ ++ } ++ if (code == (BPF_JMP | BPF_CALL) || ++ code == (BPF_JMP | BPF_CALL_ARGS)) { ++ if (code == (BPF_JMP | BPF_CALL_ARGS)) ++ insns[i].code = BPF_JMP | BPF_CALL; ++ if (!bpf_dump_raw_ok(f_cred)) ++ insns[i].imm = 0; ++ continue; ++ } ++ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { ++ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; ++ continue; ++ } ++ ++ if (code != (BPF_LD | BPF_IMM | BPF_DW)) ++ continue; ++ ++ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; ++ map = bpf_map_from_imm(prog, imm, &off, &type); ++ if (map) { ++ insns[i].src_reg = type; ++ insns[i].imm = map->id; ++ insns[i + 1].imm = off; ++ continue; ++ } ++ } ++ ++ return insns; ++} ++ ++static int set_info_rec_size(struct bpf_prog_info *info) ++{ ++ /* ++ * Ensure info.*_rec_size is the same as kernel expected size ++ * ++ * or ++ * ++ * Only allow zero *_rec_size if both _rec_size and _cnt are ++ * zero. In this case, the kernel will set the expected ++ * _rec_size back to the info. ++ */ ++ ++ if ((info->nr_func_info || info->func_info_rec_size) && ++ info->func_info_rec_size != sizeof(struct bpf_func_info)) ++ return -EINVAL; ++ ++ if ((info->nr_line_info || info->line_info_rec_size) && ++ info->line_info_rec_size != sizeof(struct bpf_line_info)) ++ return -EINVAL; ++ ++ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && ++ info->jited_line_info_rec_size != sizeof(__u64)) ++ return -EINVAL; ++ ++ info->func_info_rec_size = sizeof(struct bpf_func_info); ++ info->line_info_rec_size = sizeof(struct bpf_line_info); ++ info->jited_line_info_rec_size = sizeof(__u64); ++ ++ return 0; ++} ++ ++static int bpf_prog_get_info_by_fd(struct file *file, ++ struct bpf_prog *prog, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct btf *attach_btf = bpf_prog_get_target_btf(prog); ++ struct bpf_prog_info info; ++ u32 info_len = attr->info.info_len; ++ struct bpf_prog_kstats stats; ++ char __user *uinsns; ++ u32 ulen; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = prog->type; ++ info.id = prog->aux->id; ++ info.load_time = prog->aux->load_time; ++ info.created_by_uid = from_kuid_munged(current_user_ns(), ++ prog->aux->user->uid); ++ info.gpl_compatible = prog->gpl_compatible; ++ ++ memcpy(info.tag, prog->tag, sizeof(prog->tag)); ++ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ulen = info.nr_map_ids; ++ info.nr_map_ids = prog->aux->used_map_cnt; ++ ulen = min_t(u32, info.nr_map_ids, ulen); ++ if (ulen) { ++ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); ++ u32 i; 
++ ++ for (i = 0; i < ulen; i++) ++ if (put_user(prog->aux->used_maps[i]->id, ++ &user_map_ids[i])) { ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return -EFAULT; ++ } ++ } ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ err = set_info_rec_size(&info); ++ if (err) ++ return err; ++ ++ bpf_prog_get_stats(prog, &stats); ++ info.run_time_ns = stats.nsecs; ++ info.run_cnt = stats.cnt; ++ info.recursion_misses = stats.misses; ++ ++ info.verified_insns = prog->aux->verified_insns; ++ ++ if (!bpf_capable()) { ++ info.jited_prog_len = 0; ++ info.xlated_prog_len = 0; ++ info.nr_jited_ksyms = 0; ++ info.nr_jited_func_lens = 0; ++ info.nr_func_info = 0; ++ info.nr_line_info = 0; ++ info.nr_jited_line_info = 0; ++ goto done; ++ } ++ ++ ulen = info.xlated_prog_len; ++ info.xlated_prog_len = bpf_prog_insn_size(prog); ++ if (info.xlated_prog_len && ulen) { ++ struct bpf_insn *insns_sanitized; ++ bool fault; ++ ++ if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { ++ info.xlated_prog_insns = 0; ++ goto done; ++ } ++ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); ++ if (!insns_sanitized) ++ return -ENOMEM; ++ uinsns = u64_to_user_ptr(info.xlated_prog_insns); ++ ulen = min_t(u32, info.xlated_prog_len, ulen); ++ fault = copy_to_user(uinsns, insns_sanitized, ulen); ++ kfree(insns_sanitized); ++ if (fault) ++ return -EFAULT; ++ } ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_info_fill(&info, prog); ++ if (err) ++ return err; ++ goto done; ++ } ++ ++ /* NOTE: the following code is supposed to be skipped for offload. ++ * bpf_prog_offload_info_fill() is the place to fill similar fields ++ * for offload. ++ */ ++ ulen = info.jited_prog_len; ++ if (prog->aux->func_cnt) { ++ u32 i; ++ ++ info.jited_prog_len = 0; ++ for (i = 0; i < prog->aux->func_cnt; i++) ++ info.jited_prog_len += prog->aux->func[i]->jited_len; ++ } else { ++ info.jited_prog_len = prog->jited_len; ++ } ++ ++ if (info.jited_prog_len && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ uinsns = u64_to_user_ptr(info.jited_prog_insns); ++ ulen = min_t(u32, info.jited_prog_len, ulen); ++ ++ /* for multi-function programs, copy the JITed ++ * instructions for all the functions ++ */ ++ if (prog->aux->func_cnt) { ++ u32 len, free, i; ++ u8 *img; ++ ++ free = ulen; ++ for (i = 0; i < prog->aux->func_cnt; i++) { ++ len = prog->aux->func[i]->jited_len; ++ len = min_t(u32, len, free); ++ img = (u8 *) prog->aux->func[i]->bpf_func; ++ if (copy_to_user(uinsns, img, len)) ++ return -EFAULT; ++ uinsns += len; ++ free -= len; ++ if (!free) ++ break; ++ } ++ } else { ++ if (copy_to_user(uinsns, prog->bpf_func, ulen)) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_prog_insns = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_ksyms; ++ info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long ksym_addr; ++ u64 __user *user_ksyms; ++ u32 i; ++ ++ /* copy the address of the kernel symbol ++ * corresponding to each function ++ */ ++ ulen = min_t(u32, info.nr_jited_ksyms, ulen); ++ user_ksyms = u64_to_user_ptr(info.jited_ksyms); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ ksym_addr = (unsigned long) ++ prog->aux->func[i]->bpf_func; ++ if (put_user((u64) ksym_addr, ++ &user_ksyms[i])) ++ return -EFAULT; ++ } ++ } else { ++ ksym_addr = (unsigned long) prog->bpf_func; ++ if (put_user((u64) ksym_addr, &user_ksyms[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_ksyms = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_func_lens; ++ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ u32 __user *user_lens; ++ u32 func_len, i; ++ ++ /* copy the JITed image lengths for each function */ ++ ulen = min_t(u32, info.nr_jited_func_lens, ulen); ++ user_lens = u64_to_user_ptr(info.jited_func_lens); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ func_len = ++ prog->aux->func[i]->jited_len; ++ if (put_user(func_len, &user_lens[i])) ++ return -EFAULT; ++ } ++ } else { ++ func_len = prog->jited_len; ++ if (put_user(func_len, &user_lens[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_func_lens = 0; ++ } ++ } ++ ++ if (prog->aux->btf) ++ info.btf_id = btf_obj_id(prog->aux->btf); ++ info.attach_btf_id = prog->aux->attach_btf_id; ++ if (attach_btf) ++ info.attach_btf_obj_id = btf_obj_id(attach_btf); ++ ++ ulen = info.nr_func_info; ++ info.nr_func_info = prog->aux->func_info_cnt; ++ if (info.nr_func_info && ulen) { ++ char __user *user_finfo; ++ ++ user_finfo = u64_to_user_ptr(info.func_info); ++ ulen = min_t(u32, info.nr_func_info, ulen); ++ if (copy_to_user(user_finfo, prog->aux->func_info, ++ info.func_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_line_info; ++ info.nr_line_info = prog->aux->nr_linfo; ++ if (info.nr_line_info && ulen) { ++ __u8 __user *user_linfo; ++ ++ user_linfo = u64_to_user_ptr(info.line_info); ++ ulen = min_t(u32, info.nr_line_info, ulen); ++ if (copy_to_user(user_linfo, prog->aux->linfo, ++ info.line_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_jited_line_info; ++ if (prog->aux->jited_linfo) ++ info.nr_jited_line_info = prog->aux->nr_linfo; ++ else ++ info.nr_jited_line_info = 0; ++ if (info.nr_jited_line_info && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long line_addr; ++ __u64 __user *user_linfo; ++ u32 i; ++ ++ user_linfo = u64_to_user_ptr(info.jited_line_info); ++ ulen = min_t(u32, info.nr_jited_line_info, ulen); ++ for (i = 0; i < ulen; i++) { ++ line_addr = (unsigned long)prog->aux->jited_linfo[i]; ++ if (put_user((__u64)line_addr, &user_linfo[i])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_line_info = 0; ++ } ++ } ++ ++ ulen = info.nr_prog_tags; ++ info.nr_prog_tags = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; ++ u32 i; ++ ++ user_prog_tags = u64_to_user_ptr(info.prog_tags); ++ ulen = min_t(u32, info.nr_prog_tags, ulen); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ if (copy_to_user(user_prog_tags[i], ++ prog->aux->func[i]->tag, ++ BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(user_prog_tags[0], ++ prog->tag, BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } ++ ++done: ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_map_get_info_by_fd(struct file *file, ++ struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_map_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ info.type = map->map_type; ++ info.id = map->id; ++ info.key_size = map->key_size; ++ info.value_size = map->value_size; ++ info.max_entries = map->max_entries; ++ info.map_flags = map->map_flags; ++ info.map_extra = map->map_extra; ++ memcpy(info.name, map->name, sizeof(map->name)); ++ ++ if (map->btf) { ++ info.btf_id = btf_obj_id(map->btf); ++ info.btf_key_type_id = map->btf_key_type_id; ++ info.btf_value_type_id = map->btf_value_type_id; ++ } ++ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_info_fill(&info, map); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_btf_get_info_by_fd(struct file *file, ++ struct btf *btf, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); ++ if (err) ++ return err; ++ ++ return btf_get_info_by_fd(btf, attr, uattr); ++} ++ ++static int bpf_link_get_info_by_fd(struct file *file, ++ struct bpf_link *link, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_link_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = link->type; ++ info.id = link->id; ++ info.prog_id = link->prog->aux->id; ++ ++ if (link->ops->fill_link_info) { ++ err = link->ops->fill_link_info(link, &info); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++ ++#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info ++ ++static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ int ufd = attr->info.bpf_fd; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ if 
(!f.file) ++ return -EBADFD; ++ ++ if (f.file->f_op == &bpf_prog_fops) ++ err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &bpf_map_fops) ++ err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &btf_fops) ++ err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); ++ else if (f.file->f_op == &bpf_link_fops) ++ err = bpf_link_get_info_by_fd(f.file, f.file->private_data, ++ attr, uattr); ++ else ++ err = -EINVAL; ++ ++ fdput(f); ++ return err; ++} ++ ++#define BPF_BTF_LOAD_LAST_FIELD btf_log_level ++ ++static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) ++{ ++ if (CHECK_ATTR(BPF_BTF_LOAD)) ++ return -EINVAL; ++ ++ if (!bpf_capable()) ++ return -EPERM; ++ ++ return btf_new_fd(attr, uattr); ++} ++ ++#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id ++ ++static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return btf_get_fd_by_id(attr->btf_id); ++} ++ ++static int bpf_task_fd_query_copy(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ u32 prog_id, u32 fd_type, ++ const char *buf, u64 probe_offset, ++ u64 probe_addr) ++{ ++ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); ++ u32 len = buf ? strlen(buf) : 0, input_len; ++ int err = 0; ++ ++ if (put_user(len, &uattr->task_fd_query.buf_len)) ++ return -EFAULT; ++ input_len = attr->task_fd_query.buf_len; ++ if (input_len && ubuf) { ++ if (!len) { ++ /* nothing to copy, just make ubuf NULL terminated */ ++ char zero = '\0'; ++ ++ if (put_user(zero, ubuf)) ++ return -EFAULT; ++ } else if (input_len >= len + 1) { ++ /* ubuf can hold the string with NULL terminator */ ++ if (copy_to_user(ubuf, buf, len + 1)) ++ return -EFAULT; ++ } else { ++ /* ubuf cannot hold the string with NULL terminator, ++ * do a partial copy with NULL terminator. 
++ */ ++ char zero = '\0'; ++ ++ err = -ENOSPC; ++ if (copy_to_user(ubuf, buf, input_len - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + input_len - 1)) ++ return -EFAULT; ++ } ++ } ++ ++ if (put_user(prog_id, &uattr->task_fd_query.prog_id) || ++ put_user(fd_type, &uattr->task_fd_query.fd_type) || ++ put_user(probe_offset, &uattr->task_fd_query.probe_offset) || ++ put_user(probe_addr, &uattr->task_fd_query.probe_addr)) ++ return -EFAULT; ++ ++ return err; ++} ++ ++#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr ++ ++static int bpf_task_fd_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ pid_t pid = attr->task_fd_query.pid; ++ u32 fd = attr->task_fd_query.fd; ++ const struct perf_event *event; ++ struct task_struct *task; ++ struct file *file; ++ int err; ++ ++ if (CHECK_ATTR(BPF_TASK_FD_QUERY)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (attr->task_fd_query.flags != 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ task = get_pid_task(find_vpid(pid), PIDTYPE_PID); ++ rcu_read_unlock(); ++ if (!task) ++ return -ENOENT; ++ ++ err = 0; ++ file = fget_task(task, fd); ++ put_task_struct(task); ++ if (!file) ++ return -EBADF; ++ ++ if (file->f_op == &bpf_link_fops) { ++ struct bpf_link *link = file->private_data; ++ ++ if (link->ops == &bpf_raw_tp_link_lops) { ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ struct bpf_raw_event_map *btp = raw_tp->btp; ++ ++ err = bpf_task_fd_query_copy(attr, uattr, ++ raw_tp->link.prog->aux->id, ++ BPF_FD_TYPE_RAW_TRACEPOINT, ++ btp->tp->name, 0, 0); ++ goto put_file; ++ } ++ goto out_not_supp; ++ } ++ ++ event = perf_get_event(file); ++ if (!IS_ERR(event)) { ++ u64 probe_offset, probe_addr; ++ u32 prog_id, fd_type; ++ const char *buf; ++ ++ err = bpf_get_perf_event_info(event, &prog_id, &fd_type, ++ &buf, &probe_offset, ++ &probe_addr); ++ if (!err) ++ err = bpf_task_fd_query_copy(attr, uattr, prog_id, ++ fd_type, buf, ++ probe_offset, ++ probe_addr); ++ goto put_file; ++ } ++ ++out_not_supp: ++ err = -ENOTSUPP; ++put_file: ++ fput(file); ++ return err; ++} ++ ++#define BPF_MAP_BATCH_LAST_FIELD batch.flags ++ ++#define BPF_DO_BATCH(fn) \ ++ do { \ ++ if (!fn) { \ ++ err = -ENOTSUPP; \ ++ goto err_put; \ ++ } \ ++ err = fn(map, attr, uattr); \ ++ } while (0) ++ ++static int bpf_map_do_batch(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ int cmd) ++{ ++ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || ++ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; ++ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; ++ struct bpf_map *map; ++ int err, ufd; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_BATCH)) ++ return -EINVAL; ++ ++ ufd = attr->batch.map_fd; ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (has_write) ++ bpf_map_write_active_inc(map); ++ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (cmd == BPF_MAP_LOOKUP_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_batch); ++ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); ++ else if (cmd == BPF_MAP_UPDATE_BATCH) ++ BPF_DO_BATCH(map->ops->map_update_batch); ++ else ++ BPF_DO_BATCH(map->ops->map_delete_batch); ++err_put: ++ if (has_write) ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define 
BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies ++static int link_create(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_CREATE)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->link_create.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ ret = bpf_prog_attach_check_attach_type(prog, ++ attr->link_create.attach_type); ++ if (ret) ++ goto out; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_EXT: ++ break; ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT && ++ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ default: ++ ptype = attach_type_to_prog_type(attr->link_create.attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ } ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ ret = cgroup_bpf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_EXT: ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_TRACING: ++ if (attr->link_create.attach_type != prog->expected_attach_type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ++ ret = bpf_raw_tp_link_attach(prog, NULL); ++ else if (prog->expected_attach_type == BPF_TRACE_ITER) ++ ret = bpf_iter_link_attach(attr, uattr, prog); ++ else if (prog->expected_attach_type == BPF_LSM_CGROUP) ++ ret = cgroup_bpf_link_attach(attr, prog); ++ else ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ ret = netns_bpf_link_create(attr, prog); ++ break; ++#ifdef CONFIG_NET ++ case BPF_PROG_TYPE_XDP: ++ ret = bpf_xdp_link_attach(attr, prog); ++ break; ++#endif ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ ret = bpf_perf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type == BPF_PERF_EVENT) ++ ret = bpf_perf_link_attach(attr, prog); ++ else ++ ret = bpf_kprobe_multi_link_attach(attr, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++out: ++ if (ret < 0) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd ++ ++static int link_update(union bpf_attr *attr) ++{ ++ struct bpf_prog *old_prog = NULL, *new_prog; ++ struct bpf_link *link; ++ u32 flags; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_UPDATE)) ++ return -EINVAL; ++ ++ flags = attr->link_update.flags; ++ if (flags & ~BPF_F_REPLACE) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_update.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ new_prog = bpf_prog_get(attr->link_update.new_prog_fd); ++ if (IS_ERR(new_prog)) { ++ ret = PTR_ERR(new_prog); ++ goto out_put_link; ++ } ++ ++ if (flags & BPF_F_REPLACE) { 
++ old_prog = bpf_prog_get(attr->link_update.old_prog_fd); ++ if (IS_ERR(old_prog)) { ++ ret = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto out_put_progs; ++ } ++ } else if (attr->link_update.old_prog_fd) { ++ ret = -EINVAL; ++ goto out_put_progs; ++ } ++ ++ if (link->ops->update_prog) ++ ret = link->ops->update_prog(link, new_prog, old_prog); ++ else ++ ret = -EINVAL; ++ ++out_put_progs: ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ if (ret) ++ bpf_prog_put(new_prog); ++out_put_link: ++ bpf_link_put(link); ++ return ret; ++} ++ ++#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd ++ ++static int link_detach(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_DETACH)) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_detach.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ if (link->ops->detach) ++ ret = link->ops->detach(link); ++ else ++ ret = -EOPNOTSUPP; ++ ++ bpf_link_put(link); ++ return ret; ++} ++ ++static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) ++{ ++ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); ++} ++ ++struct bpf_link *bpf_link_by_id(u32 id) ++{ ++ struct bpf_link *link; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&link_idr_lock); ++ /* before link is "settled", ID is 0, pretend it doesn't exist yet */ ++ link = idr_find(&link_idr, id); ++ if (link) { ++ if (link->id) ++ link = bpf_link_inc_not_zero(link); ++ else ++ link = ERR_PTR(-EAGAIN); ++ } else { ++ link = ERR_PTR(-ENOENT); ++ } ++ spin_unlock_bh(&link_idr_lock); ++ return link; ++} ++ ++struct bpf_link *bpf_link_get_curr_or_next(u32 *id) ++{ ++ struct bpf_link *link; ++ ++ spin_lock_bh(&link_idr_lock); ++again: ++ link = idr_get_next(&link_idr, id); ++ if (link) { ++ link = bpf_link_inc_not_zero(link); ++ if (IS_ERR(link)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&link_idr_lock); ++ ++ return link; ++} ++ ++#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id ++ ++static int bpf_link_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ u32 id = attr->link_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ link = bpf_link_by_id(id); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ fd = bpf_link_new_fd(link); ++ if (fd < 0) ++ bpf_link_put(link); ++ ++ return fd; ++} ++ ++DEFINE_MUTEX(bpf_stats_enabled_mutex); ++ ++static int bpf_stats_release(struct inode *inode, struct file *file) ++{ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ static_key_slow_dec(&bpf_stats_enabled_key.key); ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return 0; ++} ++ ++static const struct file_operations bpf_stats_fops = { ++ .release = bpf_stats_release, ++}; ++ ++static int bpf_enable_runtime_stats(void) ++{ ++ int fd; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ ++ /* Set a very high limit to avoid overflow */ ++ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return -EBUSY; ++ } ++ ++ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); ++ if (fd >= 0) ++ static_key_slow_inc(&bpf_stats_enabled_key.key); ++ ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return fd; ++} ++ ++#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type ++ ++static int bpf_enable_stats(union bpf_attr *attr) ++{ ++ ++ if (CHECK_ATTR(BPF_ENABLE_STATS)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ 
++ switch (attr->enable_stats.type) { ++ case BPF_STATS_RUN_TIME: ++ return bpf_enable_runtime_stats(); ++ default: ++ break; ++ } ++ return -EINVAL; ++} ++ ++#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags ++ ++static int bpf_iter_create(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int err; ++ ++ if (CHECK_ATTR(BPF_ITER_CREATE)) ++ return -EINVAL; ++ ++ if (attr->iter_create.flags) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->iter_create.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ err = bpf_iter_new_fd(link); ++ bpf_link_put(link); ++ ++ return err; ++} ++ ++#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags ++ ++static int bpf_prog_bind_map(union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ struct bpf_map *map; ++ struct bpf_map **used_maps_old, **used_maps_new; ++ int i, ret = 0; ++ ++ if (CHECK_ATTR(BPF_PROG_BIND_MAP)) ++ return -EINVAL; ++ ++ if (attr->prog_bind_map.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->prog_bind_map.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ map = bpf_map_get(attr->prog_bind_map.map_fd); ++ if (IS_ERR(map)) { ++ ret = PTR_ERR(map); ++ goto out_prog_put; ++ } ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ++ used_maps_old = prog->aux->used_maps; ++ ++ for (i = 0; i < prog->aux->used_map_cnt; i++) ++ if (used_maps_old[i] == map) { ++ bpf_map_put(map); ++ goto out_unlock; ++ } ++ ++ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, ++ sizeof(used_maps_new[0]), ++ GFP_KERNEL); ++ if (!used_maps_new) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ memcpy(used_maps_new, used_maps_old, ++ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); ++ used_maps_new[prog->aux->used_map_cnt] = map; ++ ++ prog->aux->used_map_cnt++; ++ prog->aux->used_maps = used_maps_new; ++ ++ kfree(used_maps_old); ++ ++out_unlock: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ if (ret) ++ bpf_map_put(map); ++out_prog_put: ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) ++{ ++ union bpf_attr attr; ++ bool capable; ++ int err; ++ ++ capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; ++ ++ /* Intent here is for unprivileged_bpf_disabled to block key object ++ * creation commands for unprivileged users; other actions depend ++ * of fd availability and access to bpffs, so are dependent on ++ * object creation success. Capabilities are later verified for ++ * operations such as load and map create, so even with unprivileged ++ * BPF disabled, capability checks are still carried out for these ++ * and other operations. 
++ */ ++ if (!capable && ++ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) ++ return -EPERM; ++ ++ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); ++ if (err) ++ return err; ++ size = min_t(u32, size, sizeof(attr)); ++ ++ /* copy attributes from user space, may be less than sizeof(bpf_attr) */ ++ memset(&attr, 0, sizeof(attr)); ++ if (copy_from_bpfptr(&attr, uattr, size) != 0) ++ return -EFAULT; ++ ++ err = security_bpf(cmd, &attr, size); ++ if (err < 0) ++ return err; ++ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ err = map_create(&attr); ++ break; ++ case BPF_MAP_LOOKUP_ELEM: ++ err = map_lookup_elem(&attr); ++ break; ++ case BPF_MAP_UPDATE_ELEM: ++ err = map_update_elem(&attr, uattr); ++ break; ++ case BPF_MAP_DELETE_ELEM: ++ err = map_delete_elem(&attr); ++ break; ++ case BPF_MAP_GET_NEXT_KEY: ++ err = map_get_next_key(&attr); ++ break; ++ case BPF_MAP_FREEZE: ++ err = map_freeze(&attr); ++ break; ++ case BPF_PROG_LOAD: ++ err = bpf_prog_load(&attr, uattr); ++ break; ++ case BPF_OBJ_PIN: ++ err = bpf_obj_pin(&attr); ++ break; ++ case BPF_OBJ_GET: ++ err = bpf_obj_get(&attr); ++ break; ++ case BPF_PROG_ATTACH: ++ err = bpf_prog_attach(&attr); ++ break; ++ case BPF_PROG_DETACH: ++ err = bpf_prog_detach(&attr); ++ break; ++ case BPF_PROG_QUERY: ++ err = bpf_prog_query(&attr, uattr.user); ++ break; ++ case BPF_PROG_TEST_RUN: ++ err = bpf_prog_test_run(&attr, uattr.user); ++ break; ++ case BPF_PROG_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &prog_idr, &prog_idr_lock); ++ break; ++ case BPF_MAP_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &map_idr, &map_idr_lock); ++ break; ++ case BPF_BTF_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &btf_idr, &btf_idr_lock); ++ break; ++ case BPF_PROG_GET_FD_BY_ID: ++ err = bpf_prog_get_fd_by_id(&attr); ++ break; ++ case BPF_MAP_GET_FD_BY_ID: ++ err = bpf_map_get_fd_by_id(&attr); ++ break; ++ case BPF_OBJ_GET_INFO_BY_FD: ++ err = bpf_obj_get_info_by_fd(&attr, uattr.user); ++ break; ++ case BPF_RAW_TRACEPOINT_OPEN: ++ err = bpf_raw_tracepoint_open(&attr); ++ break; ++ case BPF_BTF_LOAD: ++ err = bpf_btf_load(&attr, uattr); ++ break; ++ case BPF_BTF_GET_FD_BY_ID: ++ err = bpf_btf_get_fd_by_id(&attr); ++ break; ++ case BPF_TASK_FD_QUERY: ++ err = bpf_task_fd_query(&attr, uattr.user); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_ELEM: ++ err = map_lookup_and_delete_elem(&attr); ++ break; ++ case BPF_MAP_LOOKUP_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, ++ BPF_MAP_LOOKUP_AND_DELETE_BATCH); ++ break; ++ case BPF_MAP_UPDATE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); ++ break; ++ case BPF_MAP_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); ++ break; ++ case BPF_LINK_CREATE: ++ err = link_create(&attr, uattr); ++ break; ++ case BPF_LINK_UPDATE: ++ err = link_update(&attr); ++ break; ++ case BPF_LINK_GET_FD_BY_ID: ++ err = bpf_link_get_fd_by_id(&attr); ++ break; ++ case BPF_LINK_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &link_idr, &link_idr_lock); ++ break; ++ case BPF_ENABLE_STATS: ++ err = bpf_enable_stats(&attr); ++ break; ++ case BPF_ITER_CREATE: ++ err = bpf_iter_create(&attr); ++ break; ++ case BPF_LINK_DETACH: ++ err = link_detach(&attr); ++ break; ++ case BPF_PROG_BIND_MAP: ++ err = bpf_prog_bind_map(&attr); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++ return 
err; ++} ++ ++SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) ++{ ++ return __sys_bpf(cmd, USER_BPFPTR(uattr), size); ++} ++ ++static bool syscall_prog_is_valid_access(int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ if (off < 0 || off >= U16_MAX) ++ return false; ++ if (off % size != 0) ++ return false; ++ return true; ++} ++ ++BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) ++{ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ case BPF_MAP_UPDATE_ELEM: ++ case BPF_MAP_FREEZE: ++ case BPF_PROG_LOAD: ++ case BPF_BTF_LOAD: ++ case BPF_LINK_CREATE: ++ case BPF_RAW_TRACEPOINT_OPEN: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); ++} ++ ++ ++/* To shut up -Wmissing-prototypes. ++ * This function is used by the kernel light skeleton ++ * to load bpf programs when modules are loaded or during kernel boot. ++ * See tools/lib/bpf/skel_internal.h ++ */ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); ++ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) ++{ ++ struct bpf_prog * __maybe_unused prog; ++ struct bpf_tramp_run_ctx __maybe_unused run_ctx; ++ ++ switch (cmd) { ++#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ ++ case BPF_PROG_TEST_RUN: ++ if (attr->test.data_in || attr->test.data_out || ++ attr->test.ctx_out || attr->test.duration || ++ attr->test.repeat || attr->test.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || ++ attr->test.ctx_size_in > U16_MAX) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ run_ctx.bpf_cookie = 0; ++ run_ctx.saved_run_ctx = NULL; ++ if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { ++ /* recursion detected */ ++ bpf_prog_put(prog); ++ return -EBUSY; ++ } ++ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); ++ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); ++ bpf_prog_put(prog); ++ return 0; ++#endif ++ default: ++ return ____bpf_sys_bpf(cmd, attr, size); ++ } ++} ++EXPORT_SYMBOL(kern_sys_bpf); ++ ++static const struct bpf_func_proto bpf_sys_bpf_proto = { ++ .func = bpf_sys_bpf, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, ++ .arg3_type = ARG_CONST_SIZE, ++}; ++ ++const struct bpf_func_proto * __weak ++tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ return bpf_base_func_proto(func_id); ++} ++ ++BPF_CALL_1(bpf_sys_close, u32, fd) ++{ ++ /* When bpf program calls this helper there should not be ++ * an fdget() without matching completed fdput(). 
++ * This helper is allowed in the following callchain only: ++ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close ++ */ ++ return close_fd(fd); ++} ++ ++static const struct bpf_func_proto bpf_sys_close_proto = { ++ .func = bpf_sys_close, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++}; ++ ++BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) ++{ ++ if (flags) ++ return -EINVAL; ++ ++ if (name_sz <= 1 || name[name_sz - 1]) ++ return -EINVAL; ++ ++ if (!bpf_dump_raw_ok(current_cred())) ++ return -EPERM; ++ ++ *res = kallsyms_lookup_name(name); ++ return *res ? 0 : -ENOENT; ++} ++ ++static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { ++ .func = bpf_kallsyms_lookup_name, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_CONST_SIZE_OR_ZERO, ++ .arg3_type = ARG_ANYTHING, ++ .arg4_type = ARG_PTR_TO_LONG, ++}; ++ ++static const struct bpf_func_proto * ++syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ switch (func_id) { ++ case BPF_FUNC_sys_bpf: ++ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; ++ case BPF_FUNC_btf_find_by_name_kind: ++ return &bpf_btf_find_by_name_kind_proto; ++ case BPF_FUNC_sys_close: ++ return &bpf_sys_close_proto; ++ case BPF_FUNC_kallsyms_lookup_name: ++ return &bpf_kallsyms_lookup_name_proto; ++ default: ++ return tracing_prog_func_proto(func_id, prog); ++ } ++} ++ ++const struct bpf_verifier_ops bpf_syscall_verifier_ops = { ++ .get_func_proto = syscall_prog_func_proto, ++ .is_valid_access = syscall_prog_is_valid_access, ++}; ++ ++const struct bpf_prog_ops bpf_syscall_prog_ops = { ++ .test_run = bpf_prog_test_run_syscall, ++}; ++ ++#ifdef CONFIG_SYSCTL ++static int bpf_stats_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct static_key *key = (struct static_key *)table->data; ++ static int saved_val; ++ int val, ret; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .mode = table->mode, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ val = saved_val; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret && val != saved_val) { ++ if (val) ++ static_key_slow_inc(key); ++ else ++ static_key_slow_dec(key); ++ saved_val = val; ++ } ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return ret; ++} ++ ++void __weak unpriv_ebpf_notify(int new_state) ++{ ++} ++ ++static int bpf_unpriv_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret, unpriv_enable = *(int *)table->data; ++ bool locked_state = unpriv_enable == 1; ++ struct ctl_table tmp = *table; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ tmp.data = &unpriv_enable; ++ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ++ if (write && !ret) { ++ if (locked_state && unpriv_enable != 1) ++ return -EPERM; ++ *(int *)table->data = unpriv_enable; ++ } ++ ++ unpriv_ebpf_notify(unpriv_enable); ++ ++ return ret; ++} ++ ++static struct ctl_table bpf_syscall_table[] = { ++ { ++ .procname = "unprivileged_bpf_disabled", ++ .data = &sysctl_unprivileged_bpf_disabled, ++ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), ++ .mode = 0644, ++ .proc_handler = bpf_unpriv_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, ++ { ++ .procname = 
"bpf_stats_enabled", ++ .data = &bpf_stats_enabled_key.key, ++ .maxlen = sizeof(bpf_stats_enabled_key), ++ .mode = 0644, ++ .proc_handler = bpf_stats_handler, ++ }, ++ { } ++}; ++ ++static int __init bpf_syscall_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", bpf_syscall_table); ++ return 0; ++} ++late_initcall(bpf_syscall_sysctl_init); ++#endif /* CONFIG_SYSCTL */ +diff -rupN linux.orig/kernel/entry/common.c linux/kernel/entry/common.c +--- linux.orig/kernel/entry/common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/entry/common.c 2022-12-04 10:40:26.716034044 -0500 +@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_l local_irq_enable_exit_to_user(ti_work); @@ -5094,7 +33236,7 @@ index 063068a9ea9b3..26b772720b227 100644 schedule(); if (ti_work & _TIF_UPROBE) -@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void) +@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); @@ -5103,11 +33245,10 @@ index 063068a9ea9b3..26b772720b227 100644 preempt_schedule_irq(); } } -diff --git a/kernel/hung_task.c b/kernel/hung_task.c -index bb2354f73dedc..19c9de825d248 100644 ---- a/kernel/hung_task.c -+++ b/kernel/hung_task.c -@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +diff -rupN linux.orig/kernel/hung_task.c linux/kernel/hung_task.c +--- linux.orig/kernel/hung_task.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/hung_task.c 2022-12-04 10:40:26.716034044 -0500 +@@ -127,6 +127,8 @@ static void check_hung_task(struct task_ * complain: */ if (sysctl_hung_task_warnings) { @@ -5116,7 +33257,7 @@ index bb2354f73dedc..19c9de825d248 100644 if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", -@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +@@ -142,6 +144,8 @@ static void check_hung_task(struct task_ if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; @@ -5125,7 +33266,7 @@ index bb2354f73dedc..19c9de825d248 100644 } touch_nmi_watchdog(); -@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) +@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_t } unlock: rcu_read_unlock(); @@ -5144,11 +33285,10 @@ index bb2354f73dedc..19c9de825d248 100644 } if (hung_task_call_panic) -diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c -index 5db0230aa6b52..476a3fecb8c53 100644 ---- a/kernel/irq/irqdesc.c -+++ b/kernel/irq/irqdesc.c -@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +diff -rupN linux.orig/kernel/irq/irqdesc.c linux/kernel/irq/irqdesc.c +--- linux.orig/kernel/irq/irqdesc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/irq/irqdesc.c 2022-12-04 10:40:26.716034044 -0500 +@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); @@ -5179,10 +33319,24 @@ index 5db0230aa6b52..476a3fecb8c53 100644 /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. 
-diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c -index b1292a57c2a53..a6514db7ef58e 100644 ---- a/kernel/ksysfs.c -+++ b/kernel/ksysfs.c +diff -rupN linux.orig/kernel/Kconfig.preempt linux/kernel/Kconfig.preempt +--- linux.orig/kernel/Kconfig.preempt 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/Kconfig.preempt 2022-12-04 10:40:26.716034044 -0500 +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + config PREEMPT_NONE_BUILD + bool + +diff -rupN linux.orig/kernel/ksysfs.c linux/kernel/ksysfs.c +--- linux.orig/kernel/ksysfs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/ksysfs.c 2022-12-04 10:40:26.716034044 -0500 @@ -137,6 +137,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ @@ -5199,20 +33353,19 @@ index b1292a57c2a53..a6514db7ef58e 100644 /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -@@ -227,6 +236,9 @@ static struct attribute * kernel_attrs[] = { - #ifndef CONFIG_TINY_RCU +@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] &rcu_expedited_attr.attr, &rcu_normal_attr.attr, -+#endif + #endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, - #endif ++#endif NULL }; -diff --git a/kernel/panic.c b/kernel/panic.c -index c6eb8f8db0c05..c4e8896e3caba 100644 ---- a/kernel/panic.c -+++ b/kernel/panic.c + +diff -rupN linux.orig/kernel/panic.c linux/kernel/panic.c +--- linux.orig/kernel/panic.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/panic.c 2022-12-04 10:40:26.716034044 -0500 @@ -257,7 +257,6 @@ void panic(const char *fmt, ...) panic_smp_self_stop(); @@ -5249,7 +33402,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 crash_smp_send_stop(); } -@@ -604,6 +610,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -604,6 +610,8 @@ void __warn(const char *file, int line, { disable_trace_on_warning(); @@ -5258,7 +33411,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, -@@ -633,6 +641,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -633,6 +641,8 @@ void __warn(const char *file, int line, /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); @@ -5267,10 +33420,9 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 } #ifndef __WARN_FLAGS -diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h -index d947ca6c84f99..e7d8578860adf 100644 ---- a/kernel/printk/internal.h -+++ b/kernel/printk/internal.h +diff -rupN linux.orig/kernel/printk/internal.h linux/kernel/printk/internal.h +--- linux.orig/kernel/printk/internal.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/internal.h 2022-12-04 10:40:26.716034044 -0500 @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -5280,10 +33432,9 @@ index d947ca6c84f99..e7d8578860adf 100644 __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, -diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index a1a81fd9889bb..f1f9ce9b23f60 100644 ---- a/kernel/printk/printk.c -+++ b/kernel/printk/printk.c +diff -rupN linux.orig/kernel/printk/printk.c linux/kernel/printk/printk.c +--- linux.orig/kernel/printk/printk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk.c 2022-12-04 10:40:26.720034034 -0500 @@ -44,6 +44,7 @@ #include #include @@ -5292,11 +33443,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #include #include #include -@@ -223,6 +224,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - /* Number of registered extended console drivers. */ +@@ -224,6 +225,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl static int nr_ext_console_drivers; -+/* + /* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * @@ -5326,9 +33476,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + - /* ++/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. + */ @@ -271,14 +302,49 @@ static bool panic_in_progress(void) } @@ -5342,15 +33493,15 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. - */ --static int console_locked, console_suspended; ++ */ +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. -+ */ + */ +-static int console_locked, console_suspended; +static void console_kthreads_block(void) +{ + struct console *con; @@ -5386,7 +33537,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Array of consoles built from command line options (console=) -@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; +@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORM /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); @@ -5462,7 +33613,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. 
*/ /* the next printk record to read by syslog(READ) or /proc/kmsg */ -@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable_and_check(void) +@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable return 1; } @@ -5470,7 +33621,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /** * console_trylock_spinning - try to get console_lock by busy waiting * -@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void) +@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void return 1; } @@ -5478,7 +33629,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Call the specified console driver, asking it to write out the specified -@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void) +@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void * dropped, a dropped message will be written out first. */ static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5513,7 +33664,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } /* -@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ @@ -5538,7 +33689,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. -@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility if (console_trylock_spinning()) console_unlock(); preempt_enable(); @@ -5546,7 +33697,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } wake_up_klogd(); -@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const c } EXPORT_SYMBOL(_printk); @@ -5627,7 +33778,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 -@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *c #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 @@ -5636,7 +33787,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static u64 syslog_seq; static size_t record_print_text(const struct printk_record *r, -@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, +@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char * static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5651,7 +33802,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #endif /* CONFIG_PRINTK */ -@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned int cpu) +@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned i /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @@ -5708,7 +33859,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } EXPORT_SYMBOL(is_console_locked); -@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_panic(void) +@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_pani return atomic_read(&panic_cpu) != raw_smp_processor_id(); } @@ -5729,7 +33880,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return false; /* -@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(struct console *con) +@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(str * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && @@ -5907,7 +34058,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } -@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(str /* Skip record that has level above the console loglevel. */ if (suppress_message_printing(r.info->level)) { @@ -5916,7 +34067,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 goto skip; } -@@ -2715,31 +3072,65 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2715,32 +3072,66 @@ static bool console_emit_next_record(str len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } @@ -5969,7 +34120,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return true; } -+/* + /* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * @@ -5997,10 +34148,11 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover); +} + - /* ++/* * Print out all remaining records to all consoles. * -@@ -2758,8 +3149,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ + * @do_cond_resched is set by the caller. It can be true only in schedulable +@@ -2758,8 +3149,8 @@ skip: * were flushed to all usable consoles. 
A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this @@ -6011,7 +34163,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * * Requires the console_lock. */ -@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_co *handover = false; do { @@ -6045,7 +34197,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } if (*handover) return false; -@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_co return any_usable; } @@ -6141,7 +34293,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so -@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flush_mode mode) +@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flu seq = prb_first_valid_seq(prb); for_each_console(c) @@ -6150,7 +34302,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } console_unlock(); } -@@ -3189,16 +3652,27 @@ void register_console(struct console *newcon) +@@ -3189,16 +3652,27 @@ void register_console(struct console *ne if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -6189,7 +34341,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 struct console *con; int res; -@@ -3265,9 +3740,26 @@ int unregister_console(struct console *console) +@@ -3265,9 +3740,26 @@ int unregister_console(struct console *c console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; @@ -6237,7 +34389,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. 
*/ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) -@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *c for_each_console(c) { if (con && con != c) continue; @@ -6246,7 +34398,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 continue; printk_seq = c->seq; if (printk_seq < seq) -@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) +@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset } EXPORT_SYMBOL(pr_flush); @@ -6464,7 +34616,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static DEFINE_PER_CPU(int, printk_pending); -@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) +@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(stru { int pending = this_cpu_xchg(printk_pending, 0); @@ -6513,10 +34665,9 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } void printk_trigger_flush(void) -diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c -index ef0f9a2044da1..caac4de1ea59a 100644 ---- a/kernel/printk/printk_safe.c -+++ b/kernel/printk/printk_safe.c +diff -rupN linux.orig/kernel/printk/printk_safe.c linux/kernel/printk/printk_safe.c +--- linux.orig/kernel/printk/printk_safe.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk_safe.c 2022-12-04 10:40:26.720034034 -0500 @@ -8,7 +8,9 @@ #include #include @@ -6527,7 +34678,7 @@ index ef0f9a2044da1..caac4de1ea59a 100644 #include "internal.h" -@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -6561,11 +34712,10 @@ index ef0f9a2044da1..caac4de1ea59a 100644 + timeout_ms -= 1; + } +} -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index d8e1b270a065f..257cb6f5ea622 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsigned int cpu) +diff -rupN linux.orig/kernel/rcu/rcutorture.c linux/kernel/rcu/rcutorture.c +--- linux.orig/kernel/rcu/rcutorture.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/rcutorture.c 2022-12-04 10:40:26.720034034 -0500 +@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsig WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -6578,11 +34728,10 @@ index d8e1b270a065f..257cb6f5ea622 100644 } /* Don't allow time recalculation while creating a new task. */ -diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h -index c3fbbcc09327f..195cad14742dd 100644 ---- a/kernel/rcu/tree_stall.h -+++ b/kernel/rcu/tree_stall.h -@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned long gps) +diff -rupN linux.orig/kernel/rcu/tree_stall.h linux/kernel/rcu/tree_stall.h +--- linux.orig/kernel/rcu/tree_stall.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/tree_stall.h 2022-12-04 10:40:26.720034034 -0500 +@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned lon * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. 
*/ @@ -6590,7 +34739,7 @@ index c3fbbcc09327f..195cad14742dd 100644 trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned long gps) +@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned lon */ set_tsk_need_resched(current); set_preempt_need_resched(); @@ -6598,10 +34747,9 @@ index c3fbbcc09327f..195cad14742dd 100644 } static void check_cpu_stall(struct rcu_data *rdp) -diff --git a/kernel/reboot.c b/kernel/reboot.c -index 3c35445bf5ad3..80564ffafabff 100644 ---- a/kernel/reboot.c -+++ b/kernel/reboot.c +diff -rupN linux.orig/kernel/reboot.c linux/kernel/reboot.c +--- linux.orig/kernel/reboot.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/reboot.c 2022-12-04 10:40:26.720034034 -0500 @@ -82,6 +82,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); @@ -6610,7 +34758,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 usermodehelper_disable(); device_shutdown(); } -@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum system_states state) +@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; @@ -6630,7 +34778,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } return ret; -@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force) +@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force ret = run_cmd(poweroff_cmd); if (ret && force) { @@ -6638,7 +34786,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* -@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force) +@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force */ emergency_sync(); kernel_power_off(); @@ -6655,7 +34803,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off -@@ -916,6 +924,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) +@@ -916,6 +924,8 @@ static void hw_failure_emergency_powerof */ pr_emerg("Hardware protection shutdown failed. 
Trying emergency restart\n"); emergency_restart(); @@ -6664,7 +34812,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, -@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char * { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -6679,7 +34827,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * Queue a backup emergency shutdown in the event of -@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char * */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); @@ -6688,10 +34836,9 @@ index 3c35445bf5ad3..80564ffafabff 100644 } EXPORT_SYMBOL_GPL(hw_protection_shutdown); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index ee28253c9ac0c..2ce515d3e6f8d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c +diff -rupN linux.orig/kernel/sched/core.c linux/kernel/sched/core.c +--- linux.orig/kernel/sched/core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/core.c 2022-12-04 10:40:26.720034034 -0500 @@ -1046,6 +1046,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } @@ -6755,7 +34902,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); -@@ -3251,6 +3293,70 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3251,6 +3293,70 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ @@ -6826,7 +34973,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * wait_task_inactive - wait for a thread to unschedule. * -@@ -3269,7 +3375,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3269,7 +3375,7 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { @@ -6835,7 +34982,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 struct rq_flags rf; unsigned long ncsw; struct rq *rq; -@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct * is actually now running somewhere else! */ while (task_running(rq, p)) { @@ -6844,7 +34991,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 return 0; cpu_relax(); } -@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); @@ -6859,7 +35006,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 task_rq_unlock(rq, p, &rf); /* -@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct * running right now), it's preempted, and we should * yield - it could be a while. 
*/ @@ -6868,7 +35015,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); -@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -6878,7 +35025,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) +@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(u next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -6886,7 +35033,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_sche } while (need_resched()); } @@ -6917,7 +35064,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrac */ if (likely(!preemptible())) return; @@ -6926,7 +35073,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); -@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrac if (likely(!preemptible())) return; @@ -6936,7 +35083,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 do { /* * Because the function tracer can trace preempt_count_sub() -@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -6947,11 +35094,10 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * The idle tasks have their own, simple scheduling class: */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 914096c5b1ae1..3cb55e6ede337 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +diff -rupN linux.orig/kernel/sched/fair.c linux/kernel/sched/fair.c +--- linux.orig/kernel/sched/fair.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/fair.c 2022-12-04 10:40:26.720034034 -0500 +@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -6960,7 +35106,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq return; if (delta > ideal_runtime) @@ -6969,7 +35115,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static void -@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc * validating it and just reschedule. 
*/ if (queued) { @@ -6978,7 +35124,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } /* -@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(str * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -6987,7 +35133,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static __always_inline -@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq if (delta < 0) { if (task_current(rq, p)) @@ -6996,7 +35142,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } hrtick_start(rq, delta); -@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct return; preempt: @@ -7005,7 +35151,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved -@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_s * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -7014,7 +35160,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct */ if (task_current(rq, p)) { if (p->prio > oldprio) @@ -7023,10 +35169,9 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } else check_preempt_curr(rq, p, 0); } -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..e13090e33f3c4 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h +diff -rupN linux.orig/kernel/sched/features.h linux/kernel/sched/features.h +--- linux.orig/kernel/sched/features.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/features.h 2022-12-04 10:40:26.720034034 -0500 @@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT @@ -7037,11 +35182,10 @@ index ee7f23c76bd33..e13090e33f3c4 100644 #else /* -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e26688d387aeb..5b889de29e3c9 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_struct *p, int prio); +diff -rupN linux.orig/kernel/sched/sched.h linux/kernel/sched/sched.h +--- linux.orig/kernel/sched/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/sched.h 2022-12-04 10:40:26.724034024 -0500 +@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_st extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -7057,11 +35201,10 @@ index e26688d387aeb..5b889de29e3c9 100644 extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -diff --git a/kernel/signal.c b/kernel/signal.c -index 6f86fda5e432a..139b965e4fafc 100644 ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, +diff -rupN linux.orig/kernel/signal.c linux/kernel/signal.c +--- linux.orig/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/signal.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, in /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -7079,10 +35222,9 @@ index 6f86fda5e432a..139b965e4fafc 100644 freezable_schedule(); cgroup_leave_frozen(true); -diff --git a/kernel/softirq.c b/kernel/softirq.c -index c8a6913c067d9..ab1fe34326bab 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c +diff -rupN linux.orig/kernel/softirq.c linux/kernel/softirq.c +--- linux.orig/kernel/softirq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/softirq.c 2022-12-04 10:40:26.724034024 -0500 @@ -637,6 +637,24 @@ static inline void tick_irq_exit(void) #endif } @@ -7124,7 +35266,7 @@ index c8a6913c067d9..ab1fe34326bab 100644 tick_irq_exit(); } -@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq_threads = { +@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq .thread_comm = "ksoftirqd/%u", }; @@ -7196,11 +35338,10 @@ index c8a6913c067d9..ab1fe34326bab 100644 return 0; } early_initcall(spawn_ksoftirqd); -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 23af5eca11b14..b0b4e44dd0968 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) +diff -rupN linux.orig/kernel/time/hrtimer.c linux/kernel/time/hrtimer.c +--- linux.orig/kernel/time/hrtimer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/hrtimer.c 2022-12-04 10:40:26.724034024 -0500 +@@ -1805,7 +1805,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; @@ -7218,11 +35359,10 @@ index 23af5eca11b14..b0b4e44dd0968 100644 } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); -diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c -index b0e3c9205946f..133e4160ed54b 100644 ---- a/kernel/time/tick-sched.c -+++ b/kernel/time/tick-sched.c -@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +diff -rupN linux.orig/kernel/time/tick-sched.c linux/kernel/time/tick-sched.c +--- linux.orig/kernel/time/tick-sched.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/tick-sched.c 2022-12-04 10:40:26.724034024 -0500 +@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tic static inline bool local_timer_softirq_pending(void) { @@ -7231,10 +35371,9 @@ index b0e3c9205946f..133e4160ed54b 100644 } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 717fcb9fb14aa..e6219da89933d 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c +diff -rupN linux.orig/kernel/time/timer.c linux/kernel/time/timer.c +--- linux.orig/kernel/time/timer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/timer.c 2022-12-04 10:40:26.724034024 -0500 @@ -1822,7 +1822,7 @@ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; @@ -7244,11 +35383,10 @@ index 717fcb9fb14aa..e6219da89933d 100644 } /* -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index cc65887b31bd9..1d01756752676 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +diff -rupN linux.orig/kernel/trace/trace.c linux/kernel/trace/trace.c +--- linux.orig/kernel/trace/trace.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(un if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; @@ -7270,7 +35408,7 @@ index cc65887b31bd9..1d01756752676 100644 (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { @@ -7297,7 +35435,7 @@ index cc65887b31bd9..1d01756752676 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(s print_event_info(buf, m); @@ -7322,11 +35460,10 @@ index cc65887b31bd9..1d01756752676 100644 } void -diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index 0356cae0cf74e..585380a3db753 100644 ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) +diff -rupN linux.orig/kernel/trace/trace_events.c linux/kernel/trace/trace_events.c +--- linux.orig/kernel/trace/trace_events.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_events.c 2022-12-04 10:40:26.724034024 -0500 +@@ -193,6 +193,7 @@ static int trace_define_common_fields(vo /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); @@ -7334,11 +35471,10 @@ index 0356cae0cf74e..585380a3db753 100644 return ret; } -diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index 67f47ea27921d..de58eaaf1ac7a 100644 ---- a/kernel/trace/trace_output.c -+++ b/kernel/trace/trace_output.c -@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +diff -rupN linux.orig/kernel/trace/trace_output.c linux/kernel/trace/trace_output.c +--- linux.orig/kernel/trace/trace_output.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_output.c 2022-12-04 10:40:26.724034024 -0500 +@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq { char hardsoft_irq; char need_resched; @@ -7346,7 +35482,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 char irqs_off; int hardirq; int softirq; -@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) { @@ -7374,7 +35510,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : -@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq softirq ? 's' : '.' 
; @@ -7397,11 +35533,10 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33e..41596c415111b 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +diff -rupN linux.orig/kernel/watchdog.c linux/kernel/watchdog.c +--- linux.orig/kernel/watchdog.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog.c 2022-12-04 10:40:26.724034024 -0500 +@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_tim /* Start period for the next softlockup warning. */ update_report_ts(); @@ -7410,7 +35545,7 @@ index 8e61f21e7e33e..41596c415111b 100644 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); -@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_tim add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); @@ -7419,11 +35554,10 @@ index 8e61f21e7e33e..41596c415111b 100644 } return HRTIMER_RESTART; -diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c -index 247bf0b1582ca..701f35f0e2d44 100644 ---- a/kernel/watchdog_hld.c -+++ b/kernel/watchdog_hld.c -@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +diff -rupN linux.orig/kernel/watchdog_hld.c linux/kernel/watchdog_hld.c +--- linux.orig/kernel/watchdog_hld.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog_hld.c 2022-12-04 10:40:26.724034024 -0500 +@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; @@ -7432,7 +35566,7 @@ index 247bf0b1582ca..701f35f0e2d44 100644 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); -@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(s if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -7441,10 +35575,28 @@ index 247bf0b1582ca..701f35f0e2d44 100644 __this_cpu_write(hard_watchdog_warn, true); return; } -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index cb131fad117cc..c65e69bf4eebb 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug +diff -rupN linux.orig/lib/flex_proportions.c linux/lib/flex_proportions.c +--- linux.orig/lib/flex_proportions.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/flex_proportions.c 2022-12-04 10:40:26.728034014 -0500 +@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_globa + */ + if (events <= 1) + return false; ++ preempt_disable_nested(); + write_seqcount_begin(&p->sequence); + if (periods < 64) + events -= events >> periods; +@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_globa + percpu_counter_add(&p->events, -events); + p->period += periods; + write_seqcount_end(&p->sequence); ++ preempt_enable_nested(); + + return true; + } +diff -rupN linux.orig/lib/Kconfig.debug linux/lib/Kconfig.debug +--- linux.orig/lib/Kconfig.debug 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/Kconfig.debug 2022-12-04 10:40:26.724034024 -0500 @@ -811,6 +811,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. 
@@ -7455,31 +35607,10 @@ index cb131fad117cc..c65e69bf4eebb 100644 config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL -diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c -index 05cccbcf1661a..83332fefa6f42 100644 ---- a/lib/flex_proportions.c -+++ b/lib/flex_proportions.c -@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - */ - if (events <= 1) - return false; -+ preempt_disable_nested(); - write_seqcount_begin(&p->sequence); - if (periods < 64) - events -= events >> periods; -@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - percpu_counter_add(&p->events, -events); - p->period += periods; - write_seqcount_end(&p->sequence); -+ preempt_enable_nested(); - - return true; - } -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c09..ffaba68e6a290 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_enable(char *str) +diff -rupN linux.orig/lib/vsprintf.c linux/lib/vsprintf.c +--- linux.orig/lib/vsprintf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/vsprintf.c 2022-12-04 10:40:26.728034014 -0500 +@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_e } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); @@ -7543,34 +35674,14 @@ index 3c1853a9d1c09..ffaba68e6a290 100644 #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); -diff --git a/localversion-rt b/localversion-rt -new file mode 100644 -index 0000000000000..08b3e75841adc ---- /dev/null -+++ b/localversion-rt +diff -rupN linux.orig/localversion-rt linux/localversion-rt +--- linux.orig/localversion-rt 1969-12-31 19:00:00.000000000 -0500 ++++ linux/localversion-rt 2022-12-04 10:40:26.728034014 -0500 @@ -0,0 +1 @@ +-rt14 -diff --git a/mm/Kconfig b/mm/Kconfig -index 0331f1461f81c..3897e924e40f2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -579,6 +579,12 @@ config COMPACTION - it and then we would be really interested to hear about that at - linux-mm@kvack.org. - -+config COMPACT_UNEVICTABLE_DEFAULT -+ int -+ depends on COMPACTION -+ default 0 if PREEMPT_RT -+ default 1 -+ - # - # support for free page reporting - config PAGE_REPORTING -diff --git a/mm/compaction.c b/mm/compaction.c -index 640fa76228dd9..10561cb1aaad9 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c +diff -rupN linux.orig/mm/compaction.c linux/mm/compaction.c +--- linux.orig/mm/compaction.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/compaction.c 2022-12-04 10:40:26.728034014 -0500 @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. @@ -7584,10 +35695,25 @@ index 640fa76228dd9..10561cb1aaad9 100644 static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index b69979c9ced5c..d35b6fa560f0a 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c +diff -rupN linux.orig/mm/Kconfig linux/mm/Kconfig +--- linux.orig/mm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/Kconfig 2022-12-04 10:40:26.728034014 -0500 +@@ -579,6 +579,12 @@ config COMPACTION + it and then we would be really interested to hear about that at + linux-mm@kvack.org. 
+ ++config COMPACT_UNEVICTABLE_DEFAULT ++ int ++ depends on COMPACTION ++ default 0 if PREEMPT_RT ++ default 1 ++ + # + # support for free page reporting + config PAGE_REPORTING +diff -rupN linux.orig/mm/memcontrol.c linux/mm/memcontrol.c +--- linux.orig/mm/memcontrol.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/memcontrol.c 2022-12-04 10:40:26.728034014 -0500 @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) @@ -7618,7 +35744,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lru * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); @@ -7627,7 +35753,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: -@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lru WARN_ON_ONCE(!in_task()); break; default: @@ -7636,10 +35762,9 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } } -diff --git a/mm/slub.c b/mm/slub.c -index 4b98dff9be8e3..59173fa5901a0 100644 ---- a/mm/slub.c -+++ b/mm/slub.c +diff -rupN linux.orig/mm/slub.c linux/mm/slub.c +--- linux.orig/mm/slub.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/slub.c 2022-12-04 10:40:26.728034014 -0500 @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) @@ -7705,7 +35830,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif #ifdef CONFIG_SLUB_DEBUG -@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache * /* * Per slab locking using the pagelock */ @@ -7714,7 +35839,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -455,7 +463,7 @@ static __always_inline void __slab_lock(struct slab *slab) +@@ -455,7 +463,7 @@ static __always_inline void __slab_lock( bit_spin_lock(PG_locked, &page->flags); } @@ -7723,7 +35848,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -463,31 +471,19 @@ static __always_inline void __slab_unlock(struct slab *slab) +@@ -463,31 +471,19 @@ static __always_inline void __slab_unloc __bit_spin_unlock(PG_locked, &page->flags); } @@ -7760,7 +35885,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) -@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab +@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab } else #endif { @@ -7782,7 +35907,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } cpu_relax(); -@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(s unsigned long flags; local_irq_save(flags); @@ -7802,7 +35927,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_irq_restore(flags); } -@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(s #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; @@ -7842,7 +35967,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) -@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, +@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_chec } static noinline int alloc_debug_processing(struct kmem_cache *s, @@ -7862,7 +35987,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; -@@ -1390,63 +1356,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, +@@ -1390,63 +1356,6 @@ static inline int free_consistency_check return 1; } @@ -7948,7 +36073,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, -@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) @@ -7963,7 +36088,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 account_slab(slab, oo_order(oo), s, flags); -@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct set_freepointer(s, p, NULL); } @@ -7979,11 +36104,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 return slab; } -@@ -2107,6 +2011,75 @@ static inline void remove_partial(struct kmem_cache_node *n, - n->nr_partial--; +@@ -2108,6 +2012,75 @@ static inline void remove_partial(struct } -+/* + /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move @@ -8052,10 +36176,11 @@ index 4b98dff9be8e3..59173fa5901a0 100644 + return object; +} + - /* ++/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. 
-@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + * +@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kme if (!pfmemalloc_match(slab, gfpflags)) continue; @@ -8069,7 +36194,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 t = acquire_slab(s, n, slab, object == NULL); if (!t) break; -@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs { return atomic_long_read(&n->total_objects); } @@ -8179,7 +36304,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) -@@ -3041,36 +3124,52 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3041,36 +3124,52 @@ new_objects: return NULL; } @@ -8245,7 +36370,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 retry_load_slab: -@@ -3094,11 +3193,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3094,11 +3193,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; @@ -8257,7 +36382,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } /* -@@ -3202,14 +3296,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l +@@ -3202,14 +3296,8 @@ redo: object = c->freelist; slab = c->slab; @@ -8274,7 +36399,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { -@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, +@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cach if (kfence_free(head)) return; @@ -8287,7 +36412,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 do { if (unlikely(n)) { -@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free void *tail_obj = tail ? 
: head; struct kmem_cache_cpu *c; unsigned long tid; @@ -8295,7 +36420,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 redo: /* -@@ -3482,9 +3572,13 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3482,9 +3572,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); @@ -8312,7 +36437,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 set_freepointer(s, tail_obj, freelist); -@@ -3496,16 +3590,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3496,16 +3590,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } @@ -8331,7 +36456,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { -@@ -3520,11 +3606,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3520,11 +3606,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); @@ -8345,7 +36470,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, -@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc( slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); @@ -8353,7 +36478,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); -@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc( n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -8361,7 +36486,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); -@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, +@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kme { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); @@ -8390,7 +36515,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif } -@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; @@ -8398,7 +36523,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } -@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) @@ -8407,7 +36532,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slabs_node(s, node)) ret = 1; -@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_ca { void *p; void *addr = slab_address(slab); @@ -8421,7 +36546,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); -@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_ca if (!check_object(s, slab, p, val)) break; } @@ -8430,7 +36555,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 } static int validate_slab_node(struct kmem_cache *s, -@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kmem_cache *s, +@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kme { int ret = -EINVAL; @@ -8439,11 +36564,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 ret = validate_slab_cache(s); if (ret >= 0) ret = length; -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 90af9a8572f5a..7a2d73f152304 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +diff -rupN linux.orig/mm/vmstat.c linux/mm/vmstat.c +--- linux.orig/mm/vmstat.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/vmstat.c 2022-12-04 10:40:26.728034014 -0500 +@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone * * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ @@ -8453,7 +36577,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone * } __this_cpu_write(*p, x); @@ -8463,7 +36587,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_zone_page_state); -@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist } /* See __mod_node_page_state */ @@ -8473,7 +36597,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist } __this_cpu_write(*p, x); @@ -8483,7 +36607,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_node_page_state); -@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8493,7 +36617,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, __this_cpu_write(*p, -overstep); } @@ -8503,7 +36627,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8513,7 +36637,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data __this_cpu_write(*p, -overstep); } @@ -8523,7 +36647,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) -@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8533,7 +36657,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = 
__this_cpu_read(pcp->stat_threshold); -@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, __this_cpu_write(*p, overstep); } @@ -8543,7 +36667,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8553,7 +36677,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data __this_cpu_write(*p, overstep); } @@ -8563,11 +36687,10 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c -index 035812b0461cc..ecdb47712d956 100644 ---- a/net/8021q/vlan_dev.c -+++ b/net/8021q/vlan_dev.c -@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/net/8021q/vlan_dev.c linux/net/8021q/vlan_dev.c +--- linux.orig/net/8021q/vlan_dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/8021q/vlan_dev.c 2022-12-04 10:40:26.728034014 -0500 +@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { @@ -8583,11 +36706,10 @@ index 035812b0461cc..ecdb47712d956 100644 stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; -diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c -index db4f2641d1cd1..7e2a9fb5786c9 100644 ---- a/net/bridge/br_multicast.c -+++ b/net/bridge/br_multicast.c -@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, +diff -rupN linux.orig/net/bridge/br_multicast.c linux/net/bridge/br_multicast.c +--- linux.orig/net/bridge/br_multicast.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_multicast.c 2022-12-04 10:40:26.728034014 -0500 +@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct unsigned int start; do { @@ -8599,11 +36721,10 @@ index db4f2641d1cd1..7e2a9fb5786c9 100644 mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); -diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c -index 6e53dc9914094..f2fc284abab38 100644 ---- a/net/bridge/br_vlan.c -+++ b/net/bridge/br_vlan.c -@@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, +diff -rupN linux.orig/net/bridge/br_vlan.c linux/net/bridge/br_vlan.c +--- linux.orig/net/bridge/br_vlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_vlan.c 2022-12-04 10:40:26.728034014 -0500 +@@ -1389,12 +1389,12 @@ void br_vlan_get_stats(const struct net_ cpu_stats = per_cpu_ptr(v->stats, i); do { @@ -8618,11 +36739,2324 @@ index 6e53dc9914094..f2fc284abab38 100644 u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); -diff --git a/net/core/dev.c b/net/core/dev.c -index 56c8b0921c9fd..d96506980d2f2 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *data) +diff -rupN linux.orig/net/bridge/br_vlan.c.orig linux/net/bridge/br_vlan.c.orig +--- 
linux.orig/net/bridge/br_vlan.c.orig	1969-12-31 19:00:00.000000000 -0500
++++ linux/net/bridge/br_vlan.c.orig	2022-12-04 10:40:18.724054527 -0500
+@@ -0,0 +1,2310 @@
++// SPDX-License-Identifier: GPL-2.0-only
++#include <linux/kernel.h>
++#include <linux/netdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/slab.h>
++#include <net/switchdev.h>
++
++#include "br_private.h"
++#include "br_private_tunnel.h"
++
++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);
++
++static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
++			      const void *ptr)
++{
++	const struct net_bridge_vlan *vle = ptr;
++	u16 vid = *(u16 *)arg->key;
++
++	return vle->vid != vid;
++}
++
++static const struct rhashtable_params br_vlan_rht_params = {
++	.head_offset = offsetof(struct net_bridge_vlan, vnode),
++	.key_offset = offsetof(struct net_bridge_vlan, vid),
++	.key_len = sizeof(u16),
++	.nelem_hint = 3,
++	.max_size = VLAN_N_VID,
++	.obj_cmpfn = br_vlan_cmp,
++	.automatic_shrinking = true,
++};
++
++static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
++{
++	return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
++}
++
++static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
++			    const struct net_bridge_vlan *v)
++{
++	if (vg->pvid == v->vid)
++		return;
++
++	smp_wmb();
++	br_vlan_set_pvid_state(vg, v->state);
++	vg->pvid = v->vid;
++}
++
++static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
++{
++	if (vg->pvid != vid)
++		return;
++
++	smp_wmb();
++	vg->pvid = 0;
++}
++
++/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
++ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
++ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
++ */
++static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
++				bool commit)
++{
++	struct net_bridge_vlan_group *vg;
++	bool change;
++
++	if (br_vlan_is_master(v))
++		vg = br_vlan_group(v->br);
++	else
++		vg = nbp_vlan_group(v->port);
++
++	/* check if anything would be changed on commit */
++	change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
++		 ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);
++
++	if (!commit)
++		goto out;
++
++	if (flags & BRIDGE_VLAN_INFO_PVID)
++		__vlan_add_pvid(vg, v);
++	else
++		__vlan_delete_pvid(vg, v->vid);
++
++	if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
++		v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
++	else
++		v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
++
++out:
++	return change;
++}
++
++static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
++{
++	return __vlan_flags_update(v, flags, false);
++}
++
++static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
++{
++	__vlan_flags_update(v, flags, true);
++}
++
++static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
++			  struct net_bridge_vlan *v, u16 flags,
++			  struct netlink_ext_ack *extack)
++{
++	int err;
++
++	/* Try switchdev op first. In case it is not supported, fallback to
++	 * 8021q add.
++ */ ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); ++ if (err == -EOPNOTSUPP) ++ return vlan_vid_add(dev, br->vlan_proto, v->vid); ++ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; ++ return err; ++} ++ ++static void __vlan_add_list(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct list_head *headp, *hpos; ++ struct net_bridge_vlan *vent; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ headp = &vg->vlan_list; ++ list_for_each_prev(hpos, headp) { ++ vent = list_entry(hpos, struct net_bridge_vlan, vlist); ++ if (v->vid >= vent->vid) ++ break; ++ } ++ list_add_rcu(&v->vlist, hpos); ++} ++ ++static void __vlan_del_list(struct net_bridge_vlan *v) ++{ ++ list_del_rcu(&v->vlist); ++} ++ ++static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, ++ const struct net_bridge_vlan *v) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q del. ++ */ ++ err = br_switchdev_port_vlan_del(dev, v->vid); ++ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) ++ vlan_vid_del(dev, br->vlan_proto, v->vid); ++ return err == -EOPNOTSUPP ? 0 : err; ++} ++ ++/* Returns a master vlan, if it didn't exist it gets created. In all cases ++ * a reference is taken to the master vlan before returning. ++ */ ++static struct net_bridge_vlan * ++br_vlan_get_master(struct net_bridge *br, u16 vid, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *masterv; ++ ++ vg = br_vlan_group(br); ++ masterv = br_vlan_find(vg, vid); ++ if (!masterv) { ++ bool changed; ++ ++ /* missing global ctx, create it now */ ++ if (br_vlan_add(br, vid, 0, &changed, extack)) ++ return NULL; ++ masterv = br_vlan_find(vg, vid); ++ if (WARN_ON(!masterv)) ++ return NULL; ++ refcount_set(&masterv->refcnt, 1); ++ return masterv; ++ } ++ refcount_inc(&masterv->refcnt); ++ ++ return masterv; ++} ++ ++static void br_master_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(!br_vlan_is_master(v)); ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_put_master(struct net_bridge_vlan *masterv) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ if (!br_vlan_is_master(masterv)) ++ return; ++ ++ vg = br_vlan_group(masterv->br); ++ if (refcount_dec_and_test(&masterv->refcnt)) { ++ rhashtable_remove_fast(&vg->vlan_hash, ++ &masterv->vnode, br_vlan_rht_params); ++ __vlan_del_list(masterv); ++ br_multicast_toggle_one_vlan(masterv, false); ++ br_multicast_ctx_deinit(&masterv->br_mcast_ctx); ++ call_rcu(&masterv->rcu, br_master_vlan_rcu_free); ++ } ++} ++ ++static void nbp_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(br_vlan_is_master(v)); ++ /* if we had per-port stats configured then free them here */ ++ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_init_state(struct net_bridge_vlan *v) ++{ ++ struct net_bridge *br; ++ ++ if (br_vlan_is_master(v)) ++ br = v->br; ++ else ++ br = v->port->br; ++ ++ if (br_opt_get(br, BROPT_MST_ENABLED)) { ++ br_mst_vlan_init_state(v); ++ return; ++ } ++ ++ v->state = BR_STATE_FORWARDING; ++ v->msti = 0; ++} ++ ++/* This is the shared VLAN add function which works for both ports and bridge ++ * devices. 
There are four possible calls to this function in terms of the ++ * vlan entry type: ++ * 1. vlan is being added on a port (no master flags, global entry exists) ++ * 2. vlan is being added on a bridge (both master and brentry flags) ++ * 3. vlan is being added on a port, but a global entry didn't exist which ++ * is being created right now (master flag set, brentry flag unset), the ++ * global entry is used for global per-vlan features, but not for filtering ++ * 4. same as 3 but with both master and brentry flags set so the entry ++ * will be used for filtering in both the port and the bridge ++ */ ++static int __vlan_add(struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *masterv = NULL; ++ struct net_bridge_port *p = NULL; ++ struct net_bridge_vlan_group *vg; ++ struct net_device *dev; ++ struct net_bridge *br; ++ int err; ++ ++ if (br_vlan_is_master(v)) { ++ br = v->br; ++ dev = br->dev; ++ vg = br_vlan_group(br); ++ } else { ++ p = v->port; ++ br = p->br; ++ dev = p->dev; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (p) { ++ /* Add VLAN to the device filter if it is supported. ++ * This ensures tagged traffic enters the bridge when ++ * promiscuous mode is disabled by br_manage_promisc(). ++ */ ++ err = __vlan_vid_add(dev, br, v, flags, extack); ++ if (err) ++ goto out; ++ ++ /* need to work on the master vlan too */ ++ if (flags & BRIDGE_VLAN_INFO_MASTER) { ++ bool changed; ++ ++ err = br_vlan_add(br, v->vid, ++ flags | BRIDGE_VLAN_INFO_BRENTRY, ++ &changed, extack); ++ if (err) ++ goto out_filt; ++ ++ if (changed) ++ br_vlan_notify(br, NULL, v->vid, 0, ++ RTM_NEWVLAN); ++ } ++ ++ masterv = br_vlan_get_master(br, v->vid, extack); ++ if (!masterv) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->brvlan = masterv; ++ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { ++ v->stats = ++ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!v->stats) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; ++ } else { ++ v->stats = masterv->stats; ++ } ++ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); ++ } else { ++ if (br_vlan_should_use(v)) { ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, ++ false, extack); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ } ++ br_multicast_ctx_init(br, v, &v->br_mcast_ctx); ++ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; ++ } ++ ++ /* Add the dev mac and count the vlan only if it's usable */ ++ if (br_vlan_should_use(v)) { ++ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); ++ if (err) { ++ br_err(br, "failed insert local address into bridge forwarding table\n"); ++ goto out_filt; ++ } ++ vg->num_vlans++; ++ } ++ ++ /* set the state before publishing */ ++ br_vlan_init_state(v); ++ ++ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ if (err) ++ goto out_fdb_insert; ++ ++ __vlan_add_list(v); ++ __vlan_flags_commit(v, flags); ++ br_multicast_toggle_one_vlan(v, true); ++ ++ if (p) ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++out: ++ return err; ++ ++out_fdb_insert: ++ if (br_vlan_should_use(v)) { ++ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); ++ vg->num_vlans--; ++ } ++ ++out_filt: ++ if (p) { ++ __vlan_vid_del(dev, br, v); ++ if (masterv) { ++ if (v->stats && masterv->stats != v->stats) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ ++ br_vlan_put_master(masterv); ++ v->brvlan = NULL; ++ } ++ } else { ++ br_switchdev_port_vlan_del(dev, v->vid); ++ } ++ ++ goto out; ++} ++ ++static int 
__vlan_del(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan *masterv = v; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0; ++ ++ if (br_vlan_is_master(v)) { ++ vg = br_vlan_group(v->br); ++ } else { ++ p = v->port; ++ vg = nbp_vlan_group(v->port); ++ masterv = v->brvlan; ++ } ++ ++ __vlan_delete_pvid(vg, v->vid); ++ if (p) { ++ err = __vlan_vid_del(p->dev, p->br, v); ++ if (err) ++ goto out; ++ } else { ++ err = br_switchdev_port_vlan_del(v->br->dev, v->vid); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ err = 0; ++ } ++ ++ if (br_vlan_should_use(v)) { ++ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans--; ++ } ++ ++ if (masterv != v) { ++ vlan_tunnel_info_del(vg, v); ++ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ __vlan_del_list(v); ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++ br_multicast_toggle_one_vlan(v, false); ++ br_multicast_port_ctx_deinit(&v->port_mcast_ctx); ++ call_rcu(&v->rcu, nbp_vlan_rcu_free); ++ } ++ ++ br_vlan_put_master(masterv); ++out: ++ return err; ++} ++ ++static void __vlan_group_free(struct net_bridge_vlan_group *vg) ++{ ++ WARN_ON(!list_empty(&vg->vlan_list)); ++ rhashtable_destroy(&vg->vlan_hash); ++ vlan_tunnel_deinit(vg); ++ kfree(vg); ++} ++ ++static void __vlan_flush(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg) ++{ ++ struct net_bridge_vlan *vlan, *tmp; ++ u16 v_start = 0, v_end = 0; ++ int err; ++ ++ __vlan_delete_pvid(vg, vg->pvid); ++ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { ++ /* take care of disjoint ranges */ ++ if (!v_start) { ++ v_start = vlan->vid; ++ } else if (vlan->vid - v_end != 1) { ++ /* found range end, notify and start next one */ ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++ v_start = vlan->vid; ++ } ++ v_end = vlan->vid; ++ ++ err = __vlan_del(vlan); ++ if (err) { ++ br_err(br, ++ "port %u(%s) failed to delete vlan %d: %pe\n", ++ (unsigned int) p->port_no, p->dev->name, ++ vlan->vid, ERR_PTR(err)); ++ } ++ } ++ ++ /* notify about the last/whole vlan range */ ++ if (v_start) ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++} ++ ++struct sk_buff *br_handle_vlan(struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ goto out; ++ ++ /* At this point, we know that the frame was filtered and contains ++ * a valid vlan id. If the vlan id has untagged flag set, ++ * send untagged; otherwise, send tagged. ++ */ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ /* Vlan entry must be configured at this point. The ++ * only exception is the bridge is set in promisc mode and the ++ * packet is destined for the bridge device. In this case ++ * pass the packet as is. 
++ */ ++ if (!v || !br_vlan_should_use(v)) { ++ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { ++ goto out; ++ } else { ++ kfree_skb(skb); ++ return NULL; ++ } ++ } ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->tx_bytes, skb->len); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ /* If the skb will be sent using forwarding offload, the assumption is ++ * that the switchdev will inject the packet into hardware together ++ * with the bridge VLAN, so that it can be forwarded according to that ++ * VLAN. The switchdev should deal with popping the VLAN header in ++ * hardware on each egress port as appropriate. So only strip the VLAN ++ * header if forwarding offload is not being used. ++ */ ++ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && ++ !br_switchdev_frame_uses_tx_fwd_offload(skb)) ++ __vlan_hwaccel_clear_tag(skb); ++ ++ if (p && (p->flags & BR_VLAN_TUNNEL) && ++ br_handle_egress_vlan_tunnel(skb, v)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++out: ++ return skb; ++} ++ ++/* Called under RCU */ ++static bool __allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb, u16 *vid, ++ u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ bool tagged; ++ ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = true; ++ /* If vlan tx offload is disabled on bridge device and frame was ++ * sent from vlan device on the bridge device, it does not have ++ * HW accelerated vlan tag. ++ */ ++ if (unlikely(!skb_vlan_tag_present(skb) && ++ skb->protocol == br->vlan_proto)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ return false; ++ } ++ ++ if (!br_vlan_get_tag(skb, vid)) { ++ /* Tagged frame */ ++ if (skb->vlan_proto != br->vlan_proto) { ++ /* Protocol-mismatch, empty out vlan_tci for new tag */ ++ skb_push(skb, ETH_HLEN); ++ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (unlikely(!skb)) ++ return false; ++ ++ skb_pull(skb, ETH_HLEN); ++ skb_reset_mac_len(skb); ++ *vid = 0; ++ tagged = false; ++ } else { ++ tagged = true; ++ } ++ } else { ++ /* Untagged frame */ ++ tagged = false; ++ } ++ ++ if (!*vid) { ++ u16 pvid = br_get_pvid(vg); ++ ++ /* Frame had a tag with VID 0 or did not have a tag. ++ * See if pvid is set on this port. That tells us which ++ * vlan untagged or priority-tagged traffic belongs to. ++ */ ++ if (!pvid) ++ goto drop; ++ ++ /* PVID is set on this port. Any untagged or priority-tagged ++ * ingress frame is considered to belong to this vlan. ++ */ ++ *vid = pvid; ++ if (likely(!tagged)) ++ /* Untagged Frame. */ ++ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); ++ else ++ /* Priority-tagged Frame. ++ * At this point, we know that skb->vlan_tci VID ++ * field was 0. ++ * We update only VID field and preserve PCP field. 
++ */ ++ skb->vlan_tci |= pvid; ++ ++ /* if snooping and stats are disabled we can avoid the lookup */ ++ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && ++ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_pvid_state(vg); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ return true; ++ } ++ } ++ v = br_vlan_find(vg, *vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto drop; ++ ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_state(v); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->rx_bytes, skb->len); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ *vlan = v; ++ ++ return true; ++ ++drop: ++ kfree_skb(skb); ++ return false; ++} ++ ++bool br_allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, struct sk_buff *skb, ++ u16 *vid, u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ /* If VLAN filtering is disabled on the bridge, all packets are ++ * permitted. ++ */ ++ *vlan = NULL; ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = false; ++ return true; ++ } ++ ++ return __allowed_ingress(br, vg, skb, vid, state, vlan); ++} ++ ++/* Called under RCU. */ ++bool br_allowed_egress(struct net_bridge_vlan_group *vg, ++ const struct sk_buff *skb) ++{ ++ const struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ return true; ++ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ if (v && br_vlan_should_use(v) && ++ br_vlan_state_allowed(br_vlan_get_state(v), false)) ++ return true; ++ ++ return false; ++} ++ ++/* Called under RCU */ ++bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge *br = p->br; ++ struct net_bridge_vlan *v; ++ ++ /* If filtering was disabled at input, let it pass. */ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return true; ++ ++ vg = nbp_vlan_group_rcu(p); ++ if (!vg || !vg->num_vlans) ++ return false; ++ ++ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) ++ *vid = 0; ++ ++ if (!*vid) { ++ *vid = br_get_pvid(vg); ++ if (!*vid || ++ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) ++ return false; ++ ++ return true; ++ } ++ ++ v = br_vlan_find(vg, *vid); ++ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) ++ return true; ++ ++ return false; ++} ++ ++static int br_vlan_add_existing(struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct net_bridge_vlan *vlan, ++ u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ bool becomes_brentry = false; ++ int err; ++ ++ if (!br_vlan_is_brentry(vlan)) { ++ /* Trying to change flags of non-existent bridge vlan */ ++ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) ++ return -EINVAL; ++ ++ becomes_brentry = true; ++ } ++ ++ /* Master VLANs that aren't brentries weren't notified before, ++ * time to notify them now. 
++ */ ++ if (becomes_brentry || would_change) { ++ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, ++ would_change, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ if (becomes_brentry) { ++ /* It was only kept for port vlans, now make it real */ ++ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); ++ if (err) { ++ br_err(br, "failed to insert local address into bridge forwarding table\n"); ++ goto err_fdb_insert; ++ } ++ ++ refcount_inc(&vlan->refcnt); ++ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans++; ++ *changed = true; ++ br_multicast_toggle_one_vlan(vlan, true); ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ if (would_change) ++ *changed = true; ++ ++ return 0; ++ ++err_fdb_insert: ++ br_switchdev_port_vlan_del(br->dev, vlan->vid); ++ return err; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vg = br_vlan_group(br); ++ vlan = br_vlan_find(vg, vid); ++ if (vlan) ++ return br_vlan_add_existing(br, vg, vlan, flags, changed, ++ extack); ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!vlan->stats) { ++ kfree(vlan); ++ return -ENOMEM; ++ } ++ vlan->vid = vid; ++ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; ++ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; ++ vlan->br = br; ++ if (flags & BRIDGE_VLAN_INFO_BRENTRY) ++ refcount_set(&vlan->refcnt, 1); ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) { ++ free_percpu(vlan->stats); ++ kfree(vlan); ++ } else { ++ *changed = true; ++ } ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ */ ++int br_vlan_delete(struct net_bridge *br, u16 vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_is_brentry(v)) ++ return -ENOENT; ++ ++ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); ++ br_fdb_delete_by_port(br, NULL, vid, 0); ++ ++ vlan_tunnel_info_del(vg, v); ++ ++ return __vlan_del(v); ++} ++ ++void br_vlan_flush(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ __vlan_flush(br, NULL, vg); ++ RCU_INIT_POINTER(br->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (!vg) ++ return NULL; ++ ++ return br_vlan_lookup(&vg->vlan_hash, vid); ++} ++ ++/* Must be protected by RTNL. */ ++static void recalculate_group_addr(struct net_bridge *br) ++{ ++ if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) ++ return; ++ ++ spin_lock_bh(&br->lock); ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) { ++ /* Bridge Group Address */ ++ br->group_addr[5] = 0x00; ++ } else { /* vlan_enabled && ETH_P_8021AD */ ++ /* Provider Bridge Group Address */ ++ br->group_addr[5] = 0x08; ++ } ++ spin_unlock_bh(&br->lock); ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_recalculate_fwd_mask(struct net_bridge *br) ++{ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) ++ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; ++ else /* vlan_enabled && ETH_P_8021AD */ ++ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ++ ~(1u << br->group_addr[5]); ++} ++ ++int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = val, ++ }; ++ int err; ++ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) ++ return 0; ++ ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) { ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); ++ return err; ++ } ++ ++ br_manage_promisc(br); ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { ++ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); ++ br_multicast_toggle_vlan_snooping(br, false, NULL); ++ } ++ ++ return 0; ++} ++ ++bool br_vlan_enabled(const struct net_device *dev) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ return br_opt_get(br, BROPT_VLAN_ENABLED); ++} ++EXPORT_SYMBOL_GPL(br_vlan_enabled); ++ ++int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ *p_proto = ntohs(br->vlan_proto); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_proto); ++ ++int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_protocol = ntohs(proto), ++ }; ++ int err = 0; ++ struct net_bridge_port *p; ++ struct net_bridge_vlan *vlan; ++ struct net_bridge_vlan_group *vg; ++ __be16 oldproto = br->vlan_proto; ++ ++ if (br->vlan_proto == proto) ++ return 0; ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ ++ /* Add VLANs for the new proto to the device filter. */ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ err = vlan_vid_add(p->dev, proto, vlan->vid); ++ if (err) ++ goto err_filt; ++ } ++ } ++ ++ br->vlan_proto = proto; ++ ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ ++ /* Delete VLANs for the old proto from the device filter. 
*/ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, oldproto, vlan->vid); ++ } ++ } ++ ++ return 0; ++ ++err_filt: ++ attr.u.vlan_protocol = ntohs(oldproto); ++ switchdev_port_attr_set(br->dev, &attr, NULL); ++ ++ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ } ++ ++ return err; ++} ++ ++int br_vlan_set_proto(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ if (!eth_type_vlan(htons(val))) ++ return -EPROTONOSUPPORT; ++ ++ return __br_vlan_set_proto(br, htons(val), extack); ++} ++ ++int br_vlan_set_stats(struct net_bridge *br, unsigned long val) ++{ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) ++{ ++ struct net_bridge_port *p; ++ ++ /* allow to change the option if there are no port vlans configured */ ++ list_for_each_entry(p, &br->port_list, list) { ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ ++ if (vg->num_vlans) ++ return -EBUSY; ++ } ++ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ if (vid != vg->pvid) ++ return false; ++ ++ v = br_vlan_lookup(&vg->vlan_hash, vid); ++ if (v && br_vlan_should_use(v) && ++ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return true; ++ ++ return false; ++} ++ ++static void br_vlan_disable_default_pvid(struct net_bridge *br) ++{ ++ struct net_bridge_port *p; ++ u16 pvid = br->default_pvid; ++ ++ /* Disable default_pvid on all ports where it is still ++ * configured. ++ */ ++ if (vlan_default_pvid(br_vlan_group(br), pvid)) { ++ if (!br_vlan_delete(br, pvid)) ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ if (vlan_default_pvid(nbp_vlan_group(p), pvid) && ++ !nbp_vlan_delete(p, pvid)) ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ br->default_pvid = 0; ++} ++ ++int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_bridge_vlan *pvent; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ unsigned long *changed; ++ bool vlchange; ++ u16 old_pvid; ++ int err = 0; ++ ++ if (!pvid) { ++ br_vlan_disable_default_pvid(br); ++ return 0; ++ } ++ ++ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); ++ if (!changed) ++ return -ENOMEM; ++ ++ old_pvid = br->default_pvid; ++ ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. 
++ */ ++ vg = br_vlan_group(br); ++ pvent = br_vlan_find(vg, pvid); ++ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && ++ (!pvent || !br_vlan_should_use(pvent))) { ++ err = br_vlan_add(br, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, extack); ++ if (err) ++ goto out; ++ ++ if (br_vlan_delete(br, old_pvid)) ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); ++ __set_bit(0, changed); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. ++ */ ++ vg = nbp_vlan_group(p); ++ if ((old_pvid && ++ !vlan_default_pvid(vg, old_pvid)) || ++ br_vlan_find(vg, pvid)) ++ continue; ++ ++ err = nbp_vlan_add(p, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, extack); ++ if (err) ++ goto err_port; ++ if (nbp_vlan_delete(p, old_pvid)) ++ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); ++ __set_bit(p->port_no, changed); ++ } ++ ++ br->default_pvid = pvid; ++ ++out: ++ bitmap_free(changed); ++ return err; ++ ++err_port: ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ if (!test_bit(p->port_no, changed)) ++ continue; ++ ++ if (old_pvid) { ++ nbp_vlan_add(p, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, NULL); ++ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); ++ } ++ nbp_vlan_delete(p, pvid); ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ if (test_bit(0, changed)) { ++ if (old_pvid) { ++ br_vlan_add(br, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, NULL); ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); ++ } ++ br_vlan_delete(br, pvid); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ goto out; ++} ++ ++int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ u16 pvid = val; ++ int err = 0; ++ ++ if (val >= VLAN_VID_MASK) ++ return -EINVAL; ++ ++ if (pvid == br->default_pvid) ++ goto out; ++ ++ /* Only allow default pvid change when filtering is disabled */ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ pr_info_once("Please disable vlan filtering to change default_pvid\n"); ++ err = -EPERM; ++ goto out; ++ } ++ err = __br_vlan_set_default_pvid(br, pvid, extack); ++out: ++ return err; ++} ++ ++int br_vlan_init(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(*vg), GFP_KERNEL); ++ if (!vg) ++ goto out; ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ br->vlan_proto = htons(ETH_P_8021Q); ++ br->default_pvid = 1; ++ rcu_assign_pointer(br->vlgrp, vg); ++ ++out: ++ return ret; ++ ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++ kfree(vg); ++ ++ goto out; ++} ++ ++int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = p->br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), ++ }; ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); 
++ if (!vg) ++ goto out; ++ ++ ret = switchdev_port_attr_set(p->dev, &attr, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ goto err_vlan_enabled; ++ ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ rcu_assign_pointer(p->vlgrp, vg); ++ if (p->br->default_pvid) { ++ bool changed; ++ ++ ret = nbp_vlan_add(p, p->br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &changed, extack); ++ if (ret) ++ goto err_vlan_add; ++ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); ++ } ++out: ++ return ret; ++ ++err_vlan_add: ++ RCU_INIT_POINTER(p->vlgrp, NULL); ++ synchronize_rcu(); ++ vlan_tunnel_deinit(vg); ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++err_vlan_enabled: ++ kfree(vg); ++ ++ goto out; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, ++ bool *changed, struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vlan = br_vlan_find(nbp_vlan_group(port), vid); ++ if (vlan) { ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ ++ if (would_change) { ++ /* Pass the flags to the hardware bridge */ ++ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, ++ true, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ return ret; ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ *changed = would_change; ++ ++ return 0; ++ } ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->vid = vid; ++ vlan->port = port; ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) ++ kfree(vlan); ++ else ++ *changed = true; ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. 
++ */ ++int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ v = br_vlan_find(nbp_vlan_group(port), vid); ++ if (!v) ++ return -ENOENT; ++ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); ++ br_fdb_delete_by_port(port->br, port, vid, 0); ++ ++ return __vlan_del(v); ++} ++ ++void nbp_vlan_flush(struct net_bridge_port *port) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = nbp_vlan_group(port); ++ __vlan_flush(port->br, port, vg); ++ RCU_INIT_POINTER(port->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++void br_vlan_get_stats(const struct net_bridge_vlan *v, ++ struct pcpu_sw_netstats *stats) ++{ ++ int i; ++ ++ memset(stats, 0, sizeof(*stats)); ++ for_each_possible_cpu(i) { ++ u64 rxpackets, rxbytes, txpackets, txbytes; ++ struct pcpu_sw_netstats *cpu_stats; ++ unsigned int start; ++ ++ cpu_stats = per_cpu_ptr(v->stats, i); ++ do { ++ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); ++ rxpackets = u64_stats_read(&cpu_stats->rx_packets); ++ rxbytes = u64_stats_read(&cpu_stats->rx_bytes); ++ txbytes = u64_stats_read(&cpu_stats->tx_bytes); ++ txpackets = u64_stats_read(&cpu_stats->tx_packets); ++ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); ++ ++ u64_stats_add(&stats->rx_packets, rxpackets); ++ u64_stats_add(&stats->rx_bytes, rxbytes); ++ u64_stats_add(&stats->tx_bytes, txbytes); ++ u64_stats_add(&stats->tx_packets, txpackets); ++ } ++} ++ ++int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid); ++ ++int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); ++ ++void br_vlan_fill_forward_path_pvid(struct net_bridge *br, ++ struct net_device_path_ctx *ctx, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ int idx = ctx->num_vlans - 1; ++ u16 vid; ++ ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return; ++ ++ vg = br_vlan_group(br); ++ ++ if (idx >= 0 && ++ ctx->vlan[idx].proto == br->vlan_proto) { ++ vid = ctx->vlan[idx].id; ++ } else { ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; ++ vid = br_get_pvid(vg); ++ } ++ ++ path->bridge.vlan_id = vid; ++ path->bridge.vlan_proto = br->vlan_proto; ++} ++ ++int br_vlan_fill_forward_path_mode(struct net_bridge *br, ++ struct net_bridge_port *dst, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return 0; ++ ++ vg = nbp_vlan_group_rcu(dst); ++ v = br_vlan_find(vg, path->bridge.vlan_id); ++ if (!v || !br_vlan_should_use(v)) ++ return -EINVAL; ++ ++ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return 0; ++ ++ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) ++ path->bridge.vlan_mode = 
DEV_PATH_BR_VLAN_KEEP; ++ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; ++ else ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; ++ ++ return 0; ++} ++ ++int br_vlan_get_info(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info); ++ ++int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); ++ ++static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) ++{ ++ return is_vlan_dev(dev) && ++ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); ++} ++ ++static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, ++ __always_unused struct netdev_nested_priv *priv) ++{ ++ return br_vlan_is_bind_vlan_dev(dev); ++} ++ ++static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) ++{ ++ int found; ++ ++ rcu_read_lock(); ++ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, ++ NULL); ++ rcu_read_unlock(); ++ ++ return !!found; ++} ++ ++struct br_vlan_bind_walk_data { ++ u16 vid; ++ struct net_device *result; ++}; ++ ++static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_bind_walk_data *data = priv->data; ++ int found = 0; ++ ++ if (br_vlan_is_bind_vlan_dev(dev) && ++ vlan_dev_priv(dev)->vlan_id == data->vid) { ++ data->result = dev; ++ found = 1; ++ } ++ ++ return found; ++} ++ ++static struct net_device * ++br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) ++{ ++ struct br_vlan_bind_walk_data data = { ++ .vid = vid, ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, ++ &priv); ++ rcu_read_unlock(); ++ ++ return data.result; ++} ++ ++static bool br_vlan_is_dev_up(const struct net_device *dev) ++{ ++ return !!(dev->flags & IFF_UP) && netif_oper_up(dev); ++} ++ ++static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, ++ struct net_device *vlan_dev) ++{ ++ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ bool has_carrier = false; ++ ++ if (!netif_carrier_ok(br->dev)) { ++ netif_carrier_off(vlan_dev); ++ return; ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { ++ has_carrier = 
true; ++ break; ++ } ++ } ++ ++ if (has_carrier) ++ netif_carrier_on(vlan_dev); ++ else ++ netif_carrier_off(vlan_dev); ++} ++ ++static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) ++{ ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ struct net_bridge_vlan *vlan; ++ struct net_device *vlan_dev; ++ ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, ++ vlan->vid); ++ if (vlan_dev) { ++ if (br_vlan_is_dev_up(p->dev)) { ++ if (netif_carrier_ok(p->br->dev)) ++ netif_carrier_on(vlan_dev); ++ } else { ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++ } ++ } ++ } ++} ++ ++static void br_vlan_upper_change(struct net_device *dev, ++ struct net_device *upper_dev, ++ bool linking) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ if (!br_vlan_is_bind_vlan_dev(upper_dev)) ++ return; ++ ++ if (linking) { ++ br_vlan_set_vlan_dev_state(br, upper_dev); ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); ++ } else { ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, ++ br_vlan_has_upper_bind_vlan_dev(dev)); ++ } ++} ++ ++struct br_vlan_link_state_walk_data { ++ struct net_bridge *br; ++}; ++ ++static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_link_state_walk_data *data = priv->data; ++ ++ if (br_vlan_is_bind_vlan_dev(vlan_dev)) ++ br_vlan_set_vlan_dev_state(data->br, vlan_dev); ++ ++ return 0; ++} ++ ++static void br_vlan_link_state_change(struct net_device *dev, ++ struct net_bridge *br) ++{ ++ struct br_vlan_link_state_walk_data data = { ++ .br = br ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, ++ &priv); ++ rcu_read_unlock(); ++} ++ ++/* Must be protected by RTNL. */ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) ++{ ++ struct net_device *vlan_dev; ++ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); ++ if (vlan_dev) ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++} ++ ++/* Must be protected by RTNL. */ ++int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) ++{ ++ struct netdev_notifier_changeupper_info *info; ++ struct net_bridge *br = netdev_priv(dev); ++ int vlcmd = 0, ret = 0; ++ bool changed = false; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ ret = br_vlan_add(br, br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); ++ vlcmd = RTM_NEWVLAN; ++ break; ++ case NETDEV_UNREGISTER: ++ changed = !br_vlan_delete(br, br->default_pvid); ++ vlcmd = RTM_DELVLAN; ++ break; ++ case NETDEV_CHANGEUPPER: ++ info = ptr; ++ br_vlan_upper_change(dev, info->upper_dev, info->linking); ++ break; ++ ++ case NETDEV_CHANGE: ++ case NETDEV_UP: ++ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) ++ break; ++ br_vlan_link_state_change(dev, br); ++ break; ++ } ++ if (changed) ++ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) ++{ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ switch (event) { ++ case NETDEV_CHANGE: ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ br_vlan_set_all_vlan_dev_state(p); ++ break; ++ } ++} ++ ++static bool br_vlan_stats_fill(struct sk_buff *skb, ++ const struct net_bridge_vlan *v) ++{ ++ struct pcpu_sw_netstats stats; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); ++ if (!nest) ++ return false; ++ ++ br_vlan_get_stats(v, &stats); ++ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, ++ u64_stats_read(&stats.rx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, ++ u64_stats_read(&stats.rx_packets), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, ++ u64_stats_read(&stats.tx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, ++ u64_stats_read(&stats.tx_packets), ++ BRIDGE_VLANDB_STATS_PAD)) ++ goto out_err; ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++/* v_opts is used to dump the options which must be equal in the whole range */ ++static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, ++ const struct net_bridge_vlan *v_opts, ++ u16 flags, ++ bool dump_stats) ++{ ++ struct bridge_vlan_info info; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); ++ if (!nest) ++ return false; ++ ++ memset(&info, 0, sizeof(info)); ++ info.vid = vid; ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ info.flags |= BRIDGE_VLAN_INFO_PVID; ++ ++ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) ++ goto out_err; ++ ++ if (vid_range && vid < vid_range && ++ !(flags & BRIDGE_VLAN_INFO_PVID) && ++ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) ++ goto out_err; ++ ++ if (v_opts) { ++ if (!br_vlan_opts_fill(skb, v_opts)) ++ goto out_err; ++ ++ if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) ++ goto out_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++static size_t rtnl_vlan_nlmsg_size(void) ++{ ++ return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) ++ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ ++ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ ++ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ ++ + br_vlan_opts_nl_size(); /* bridge vlan options */ ++} ++ ++void br_vlan_notify(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ u16 vid, u16 vid_range, ++ int cmd) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v = NULL; ++ struct br_vlan_msg *bvm; ++ struct nlmsghdr *nlh; ++ struct sk_buff *skb; ++ int err = -ENOBUFS; ++ struct net *net; ++ u16 flags = 0; ++ int ifindex; ++ ++ /* right now notifications are done only with rtnl held */ ++ ASSERT_RTNL(); ++ ++ if (p) { ++ ifindex = p->dev->ifindex; ++ vg = nbp_vlan_group(p); ++ net = dev_net(p->dev); ++ } else { ++ ifindex = br->dev->ifindex; ++ vg = br_vlan_group(br); ++ net = dev_net(br->dev); ++ } ++ ++ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); ++ if (!skb) ++ goto out_err; ++ ++ err = -EMSGSIZE; ++ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); ++ if (!nlh) ++ goto out_err; ++ bvm = nlmsg_data(nlh); ++ 
memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = AF_BRIDGE; ++ bvm->ifindex = ifindex; ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ /* need to find the vlan due to flags/options */ ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto out_kfree; ++ ++ flags = v->flags; ++ if (br_get_pvid(vg) == v->vid) ++ flags |= BRIDGE_VLAN_INFO_PVID; ++ break; ++ case RTM_DELVLAN: ++ break; ++ default: ++ goto out_kfree; ++ } ++ ++ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) ++ goto out_err; ++ ++ nlmsg_end(skb, nlh); ++ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); ++ return; ++ ++out_err: ++ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); ++out_kfree: ++ kfree_skb(skb); ++} ++ ++/* check if v_curr can enter a range ending in range_end */ ++bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, ++ const struct net_bridge_vlan *range_end) ++{ ++ return v_curr->vid - range_end->vid == 1 && ++ range_end->flags == v_curr->flags && ++ br_vlan_opts_eq_range(v_curr, range_end); ++} ++ ++static int br_vlan_dump_dev(const struct net_device *dev, ++ struct sk_buff *skb, ++ struct netlink_callback *cb, ++ u32 dump_flags) ++{ ++ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; ++ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); ++ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); ++ struct net_bridge_vlan_group *vg; ++ int idx = 0, s_idx = cb->args[1]; ++ struct nlmsghdr *nlh = NULL; ++ struct net_bridge_port *p; ++ struct br_vlan_msg *bvm; ++ struct net_bridge *br; ++ int err = 0; ++ u16 pvid; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) ++ return -EINVAL; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group_rcu(br); ++ p = NULL; ++ } else { ++ /* global options are dumped only for bridge devices */ ++ if (dump_global) ++ return 0; ++ ++ p = br_port_get_rcu(dev); ++ if (WARN_ON(!p)) ++ return -EINVAL; ++ vg = nbp_vlan_group_rcu(p); ++ br = p->br; ++ } ++ ++ if (!vg) ++ return 0; ++ ++ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); ++ if (!nlh) ++ return -EMSGSIZE; ++ bvm = nlmsg_data(nlh); ++ memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = PF_BRIDGE; ++ bvm->ifindex = dev->ifindex; ++ pvid = br_get_pvid(vg); ++ ++ /* idx must stay at range's beginning until it is filled in */ ++ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { ++ if (!dump_global && !br_vlan_should_use(v)) ++ continue; ++ if (idx < s_idx) { ++ idx++; ++ continue; ++ } ++ ++ if (!range_start) { ++ range_start = v; ++ range_end = v; ++ continue; ++ } ++ ++ if (dump_global) { ++ if (br_vlan_global_opts_can_enter_range(v, range_end)) ++ goto update_end; ++ if (!br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, ++ range_start)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } else if (dump_stats || v->vid == pvid || ++ !br_vlan_can_enter_range(v, range_end)) { ++ u16 vlan_flags = br_vlan_flags(range_start, pvid); ++ ++ if (!br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ vlan_flags, dump_stats)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } ++update_end: ++ range_end = v; ++ } ++ ++ /* err will be 0 and range_start will be set in 3 cases here: ++ * - first vlan 
(range_start == range_end) ++ * - last vlan (range_start == range_end, not in range) ++ * - last vlan range (range_start != range_end, in range) ++ */ ++ if (!err && range_start) { ++ if (dump_global && ++ !br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, range_start)) ++ err = -EMSGSIZE; ++ else if (!dump_global && ++ !br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ br_vlan_flags(range_start, pvid), ++ dump_stats)) ++ err = -EMSGSIZE; ++ } ++ ++ cb->args[1] = err ? idx : 0; ++ ++ nlmsg_end(skb, nlh); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { ++ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, ++}; ++ ++static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; ++ int idx = 0, err = 0, s_idx = cb->args[0]; ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ u32 dump_flags = 0; ++ ++ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, ++ br_vlan_db_dump_pol, cb->extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(cb->nlh); ++ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) ++ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); ++ ++ rcu_read_lock(); ++ if (bvm->ifindex) { ++ dev = dev_get_by_index_rcu(net, bvm->ifindex); ++ if (!dev) { ++ err = -ENODEV; ++ goto out_err; ++ } ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ /* if the dump completed without an error we return 0 here */ ++ if (err != -EMSGSIZE) ++ goto out_err; ++ } else { ++ for_each_netdev_rcu(net, dev) { ++ if (idx < s_idx) ++ goto skip; ++ ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ if (err == -EMSGSIZE) ++ break; ++skip: ++ idx++; ++ } ++ } ++ cb->args[0] = idx; ++ rcu_read_unlock(); ++ ++ return skb->len; ++ ++out_err: ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { ++ [BRIDGE_VLANDB_ENTRY_INFO] = ++ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), ++ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, ++ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, ++ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, ++ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, ++}; ++ ++static int br_vlan_rtm_process_one(struct net_device *dev, ++ const struct nlattr *attr, ++ int cmd, struct netlink_ext_ack *extack) ++{ ++ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; ++ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; ++ bool changed = false, skip_processing = false; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0, cmdmap = 0; ++ struct net_bridge *br; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group(br); ++ } else { ++ p = br_port_get_rtnl(dev); ++ if (WARN_ON(!p)) ++ return -ENODEV; ++ br = p->br; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (WARN_ON(!vg)) ++ return -ENODEV; ++ ++ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, ++ br_vlan_db_policy, extack); ++ if (err) ++ return err; ++ ++ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { ++ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); ++ return -EINVAL; ++ } ++ memset(&vrange_end, 0, sizeof(vrange_end)); ++ ++ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); ++ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | ++ BRIDGE_VLAN_INFO_RANGE_END)) { ++ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed 
when using RTM vlan calls"); ++ return -EINVAL; ++ } ++ if (!br_vlan_valid_id(vinfo->vid, extack)) ++ return -EINVAL; ++ ++ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { ++ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); ++ /* validate user-provided flags without RANGE_BEGIN */ ++ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; ++ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; ++ ++ /* vinfo_last is the range start, vinfo the range end */ ++ vinfo_last = vinfo; ++ vinfo = &vrange_end; ++ ++ if (!br_vlan_valid_id(vinfo->vid, extack) || ++ !br_vlan_valid_range(vinfo, vinfo_last, extack)) ++ return -EINVAL; ++ } ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ cmdmap = RTM_SETLINK; ++ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); ++ break; ++ case RTM_DELVLAN: ++ cmdmap = RTM_DELLINK; ++ break; ++ } ++ ++ if (!skip_processing) { ++ struct bridge_vlan_info *tmp_last = vinfo_last; ++ ++ /* br_process_vlan_info may overwrite vinfo_last */ ++ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, ++ &changed, extack); ++ ++ /* notify first if anything changed */ ++ if (changed) ++ br_ifinfo_notify(cmdmap, br, p); ++ ++ if (err) ++ return err; ++ } ++ ++ /* deal with options */ ++ if (cmd == RTM_NEWVLAN) { ++ struct net_bridge_vlan *range_start, *range_end; ++ ++ if (vinfo_last) { ++ range_start = br_vlan_find(vg, vinfo_last->vid); ++ range_end = br_vlan_find(vg, vinfo->vid); ++ } else { ++ range_start = br_vlan_find(vg, vinfo->vid); ++ range_end = range_start; ++ } ++ ++ err = br_vlan_process_options(br, p, range_start, range_end, ++ tb, extack); ++ } ++ ++ return err; ++} ++ ++static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, ++ struct netlink_ext_ack *extack) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ struct nlattr *attr; ++ int err, vlans = 0; ++ int rem; ++ ++ /* this should validate the header and check for remaining bytes */ ++ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, ++ extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(nlh); ++ dev = __dev_get_by_index(net, bvm->ifindex); ++ if (!dev) ++ return -ENODEV; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { ++ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); ++ return -EINVAL; ++ } ++ ++ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { ++ switch (nla_type(attr)) { ++ case BRIDGE_VLANDB_ENTRY: ++ err = br_vlan_rtm_process_one(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ case BRIDGE_VLANDB_GLOBAL_OPTIONS: ++ err = br_vlan_rtm_process_global_options(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ default: ++ continue; ++ } ++ ++ vlans++; ++ if (err) ++ break; ++ } ++ if (!vlans) { ++ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++void br_vlan_rtnl_init(void) ++{ ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, ++ br_vlan_rtm_dump, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, ++ br_vlan_rtm_process, NULL, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, ++ br_vlan_rtm_process, NULL, 0); ++} ++ ++void br_vlan_rtnl_uninit(void) ++{ ++ rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); ++} +diff -rupN linux.orig/net/core/dev.c linux/net/core/dev.c +--- linux.orig/net/core/dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/net/core/dev.c 2022-12-04 10:40:26.732034003 -0500 +@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *da #endif /* CONFIG_RPS */ @@ -8638,7 +39072,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 -@@ -6661,6 +6652,30 @@ static void skb_defer_free_flush(struct softnet_data *sd) +@@ -6665,6 +6656,30 @@ static void skb_defer_free_flush(struct } } @@ -8669,7 +39103,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -@@ -10492,12 +10507,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, +@@ -10496,12 +10511,12 @@ void dev_fetch_sw_netstats(struct rtnl_l stats = per_cpu_ptr(netstats, cpu); do { @@ -8684,7 +39118,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; -@@ -11412,7 +11427,11 @@ static int __init net_dev_init(void) +@@ -11416,7 +11431,11 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif @@ -8696,11 +39130,11469 @@ index 56c8b0921c9fd..d96506980d2f2 100644 spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); -diff --git a/net/core/devlink.c b/net/core/devlink.c -index b50bcc18b8d9e..cfa6a099457ae 100644 ---- a/net/core/devlink.c -+++ b/net/core/devlink.c -@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, +diff -rupN linux.orig/net/core/dev.c.orig linux/net/core/dev.c.orig +--- linux.orig/net/core/dev.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/dev.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,11455 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. ++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. 
++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "net-sysfs.h" ++ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; ++struct list_head ptype_all __read_mostly; /* Taps */ ++ ++static int netif_rx_internal(struct sk_buff *skb); ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info); ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack); ++static struct napi_struct *napi_by_id(unsigned int napi_id); ++ ++/* ++ * The @dev_base_head list is protected by @dev_base_lock and the rtnl ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading, or rcu_read_lock() ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base_head list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. ++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. 
++ */ ++DEFINE_RWLOCK(dev_base_lock); ++EXPORT_SYMBOL(dev_base_lock); ++ ++static DEFINE_MUTEX(ifalias_mutex); ++ ++/* protects napi_hash addition/deletion and napi_gen_id */ ++static DEFINE_SPINLOCK(napi_hash_lock); ++ ++static unsigned int napi_gen_id = NR_CPUS; ++static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); ++ ++static DECLARE_RWSEM(devnet_rename_sem); ++ ++static inline void dev_base_seq_inc(struct net *net) ++{ ++ while (++net->dev_base_seq == 0) ++ ; ++} ++ ++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) ++{ ++ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); ++ ++ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) ++{ ++ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; ++} ++ ++static inline void rps_lock_irqsave(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); ++} ++ ++static inline void rps_lock_irq_disable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++} ++ ++static inline void rps_unlock_irq_restore(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++static inline void rps_unlock_irq_enable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); ++} ++ ++static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, ++ const char *name) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); ++ if (!name_node) ++ return NULL; ++ INIT_HLIST_NODE(&name_node->hlist); ++ name_node->dev = dev; ++ name_node->name = name; ++ return name_node; ++} ++ ++static struct netdev_name_node * ++netdev_name_node_head_alloc(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = netdev_name_node_alloc(dev, dev->name); ++ if (!name_node) ++ return NULL; ++ INIT_LIST_HEAD(&name_node->list); ++ return name_node; ++} ++ ++static void netdev_name_node_free(struct netdev_name_node *name_node) ++{ ++ kfree(name_node); ++} ++ ++static void netdev_name_node_add(struct net *net, ++ struct netdev_name_node *name_node) ++{ ++ hlist_add_head_rcu(&name_node->hlist, ++ dev_name_hash(net, name_node->name)); ++} ++ ++static void netdev_name_node_del(struct netdev_name_node *name_node) ++{ ++ hlist_del_rcu(&name_node->hlist); ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return NULL; ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry_rcu(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return 
NULL; ++} ++ ++bool netdev_name_in_use(struct net *net, const char *name) ++{ ++ return netdev_name_node_lookup(net, name); ++} ++EXPORT_SYMBOL(netdev_name_in_use); ++ ++int netdev_name_node_alt_create(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (name_node) ++ return -EEXIST; ++ name_node = netdev_name_node_alloc(dev, name); ++ if (!name_node) ++ return -ENOMEM; ++ netdev_name_node_add(net, name_node); ++ /* The node that holds dev->name acts as a head of per-device list. */ ++ list_add_tail(&name_node->list, &dev->name_node->list); ++ ++ return 0; ++} ++ ++static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) ++{ ++ list_del(&name_node->list); ++ netdev_name_node_del(name_node); ++ kfree(name_node->name); ++ netdev_name_node_free(name_node); ++} ++ ++int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (!name_node) ++ return -ENOENT; ++ /* lookup might have found our primary name or a name belonging ++ * to another device. ++ */ ++ if (name_node == dev->name_node || name_node->dev != dev) ++ return -EINVAL; ++ ++ __netdev_name_node_alt_destroy(name_node); ++ ++ return 0; ++} ++ ++static void netdev_name_node_alt_flush(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node, *tmp; ++ ++ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) ++ __netdev_name_node_alt_destroy(name_node); ++} ++ ++/* Device list insertion */ ++static void list_netdevice(struct net_device *dev) ++{ ++ struct net *net = dev_net(dev); ++ ++ ASSERT_RTNL(); ++ ++ write_lock(&dev_base_lock); ++ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); ++ netdev_name_node_add(net, dev->name_node); ++ hlist_add_head_rcu(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(net); ++} ++ ++/* Device list removal ++ * caller must respect a RCU grace period before freeing/reusing dev ++ */ ++static void unlist_netdevice(struct net_device *dev, bool lock) ++{ ++ ASSERT_RTNL(); ++ ++ /* Unlink dev from the device chain */ ++ if (lock) ++ write_lock(&dev_base_lock); ++ list_del_rcu(&dev->dev_list); ++ netdev_name_node_del(dev->name_node); ++ hlist_del_rcu(&dev->index_hlist); ++ if (lock) ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(dev_net(dev)); ++} ++ ++/* ++ * Our notifier list ++ */ ++ ++static RAW_NOTIFIER_HEAD(netdev_chain); ++ ++/* ++ * Device drivers call our routines to queue packets here. We empty the ++ * queue in the local softnet handler. 
++ */ ++ ++DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); ++EXPORT_PER_CPU_SYMBOL(softnet_data); ++ ++#ifdef CONFIG_LOCKDEP ++/* ++ * register_netdevice() inits txq->_xmit_lock and sets lockdep class ++ * according to dev->type ++ */ ++static const unsigned short netdev_lock_type[] = { ++ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ++ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ++ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ++ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ++ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ++ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ++ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ++ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ++ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ++ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ++ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ++ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ++ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ++ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ++ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; ++ ++static const char *const netdev_lock_name[] = { ++ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", ++ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", ++ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", ++ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", ++ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", ++ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", ++ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", ++ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", ++ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", ++ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", ++ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", ++ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", ++ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", ++ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", ++ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; ++ ++static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++ ++static inline unsigned short netdev_lock_pos(unsigned short dev_type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) ++ if (netdev_lock_type[i] == dev_type) ++ return i; ++ /* the last key is used by default */ ++ return ARRAY_SIZE(netdev_lock_type) - 1; ++} ++ ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev_type); ++ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], ++ netdev_lock_name[i]); ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev->type); ++ lockdep_set_class_and_name(&dev->addr_list_lock, ++ &netdev_addr_lock_key[i], ++ netdev_lock_name[i]); ++} ++#else ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++} ++#endif ++ ++/******************************************************************************* ++ * ++ * Protocol management and registration routines ++ * ++ 
*******************************************************************************/ ++ ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++static inline struct list_head *ptype_head(const struct packet_type *pt) ++{ ++ if (pt->type == htons(ETH_P_ALL)) ++ return pt->dev ? &pt->dev->ptype_all : &ptype_all; ++ else ++ return pt->dev ? &pt->dev->ptype_specific : ++ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; ++} ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. ++ * ++ * This call does not sleep therefore it can not ++ * guarantee all CPU's that are in middle of receiving packets ++ * will see the new packet type (until the next received packet). ++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ ++ spin_lock(&ptype_lock); ++ list_add_rcu(&pt->list, head); ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(dev_add_pack); ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ struct packet_type *pt1; ++ ++ spin_lock(&ptype_lock); ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ pr_warn("dev_remove_pack: %p not found\n", pt); ++out: ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(__dev_remove_pack); ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. 
++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++EXPORT_SYMBOL(dev_remove_pack); ++ ++ ++/******************************************************************************* ++ * ++ * Device Interface Subroutines ++ * ++ *******************************************************************************/ ++ ++/** ++ * dev_get_iflink - get 'iflink' value of a interface ++ * @dev: targeted interface ++ * ++ * Indicates the ifindex the interface is linked to. ++ * Physical interfaces have the same 'ifindex' and 'iflink' values. ++ */ ++ ++int dev_get_iflink(const struct net_device *dev) ++{ ++ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) ++ return dev->netdev_ops->ndo_get_iflink(dev); ++ ++ return dev->ifindex; ++} ++EXPORT_SYMBOL(dev_get_iflink); ++ ++/** ++ * dev_fill_metadata_dst - Retrieve tunnel egress information. ++ * @dev: targeted interface ++ * @skb: The packet. ++ * ++ * For better visibility of tunnel traffic OVS needs to retrieve ++ * egress tunnel information for a packet. Following API allows ++ * user to get this info. ++ */ ++int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ++{ ++ struct ip_tunnel_info *info; ++ ++ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) ++ return -EINVAL; ++ ++ info = skb_tunnel_info_unclone(skb); ++ if (!info) ++ return -ENOMEM; ++ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) ++ return -EINVAL; ++ ++ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); ++} ++EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); ++ ++static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) ++{ ++ int k = stack->num_paths++; ++ ++ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) ++ return NULL; ++ ++ return &stack->path[k]; ++} ++ ++int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, ++ struct net_device_path_stack *stack) ++{ ++ const struct net_device *last_dev; ++ struct net_device_path_ctx ctx = { ++ .dev = dev, ++ }; ++ struct net_device_path *path; ++ int ret = 0; ++ ++ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); ++ stack->num_paths = 0; ++ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { ++ last_dev = ctx.dev; ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ ++ memset(path, 0, sizeof(struct net_device_path)); ++ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); ++ if (ret < 0) ++ return -1; ++ ++ if (WARN_ON_ONCE(last_dev == ctx.dev)) ++ return -1; ++ } ++ ++ if (!ctx.dev) ++ return ret; ++ ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ path->type = DEV_PATH_ETHERNET; ++ path->dev = ctx.dev; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(dev_fill_forward_path); ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup(net, name); ++ return node_name ? 
node_name->dev : NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_name); ++ ++/** ++ * dev_get_by_name_rcu - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. ++ * If the name is found a pointer to the device is returned. ++ * If the name is not found then %NULL is returned. ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup_rcu(net, name); ++ return node_name ? node_name->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_name_rcu); ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. ++ */ ++ ++struct net_device *dev_get_by_name(struct net *net, const char *name) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_name_rcu(net, name); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_name); ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_index); ++ ++/** ++ * dev_get_by_index_rcu - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry_rcu(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_get_by_index_rcu); ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device *dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_index_rcu(net, ifindex); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_index); ++ ++/** ++ * dev_get_by_napi_id - find a device by napi_id ++ * @napi_id: ID of the NAPI struct ++ * ++ * Search for an interface by NAPI ID. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not had ++ * its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_napi_id(unsigned int napi_id) ++{ ++ struct napi_struct *napi; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ if (napi_id < MIN_NAPI_ID) ++ return NULL; ++ ++ napi = napi_by_id(napi_id); ++ ++ return napi ? napi->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_napi_id); ++ ++/** ++ * netdev_get_name - get a netdevice name, knowing its ifindex. ++ * @net: network namespace ++ * @name: a pointer to the buffer where the name will be stored. ++ * @ifindex: the ifindex of the interface to get the name from. ++ */ ++int netdev_get_name(struct net *net, char *name, int ifindex) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ down_read(&devnet_rename_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_index_rcu(net, ifindex); ++ if (!dev) { ++ ret = -ENODEV; ++ goto out; ++ } ++ ++ strcpy(name, dev->name); ++ ++ ret = 0; ++out: ++ rcu_read_unlock(); ++ up_read(&devnet_rename_sem); ++ return ret; ++} ++ ++/** ++ * dev_getbyhwaddr_rcu - find a device by its hardware address ++ * @net: the applicable net namespace ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. ++ * The caller must hold RCU or RTNL. ++ * The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ */ ++ ++struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, ++ const char *ha) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_getbyhwaddr_rcu); ++ ++struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) ++{ ++ struct net_device *dev, *ret = NULL; ++ ++ rcu_read_lock(); ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type) { ++ dev_hold(dev); ++ ret = dev; ++ break; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @net: the applicable net namespace ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. Must be called inside ++ * rtnl_lock(), and result refcount is unchanged. 
++ */ ++ ++struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, ++ unsigned short mask) ++{ ++ struct net_device *dev, *ret; ++ ++ ASSERT_RTNL(); ++ ++ ret = NULL; ++ for_each_netdev(net, dev) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ ret = dev; ++ break; ++ } ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__dev_get_by_flags); ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * allow sysfs to work. We also disallow any kind of ++ * whitespace. ++ */ ++bool dev_valid_name(const char *name) ++{ ++ if (*name == '\0') ++ return false; ++ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) ++ return false; ++ if (!strcmp(name, ".") || !strcmp(name, "..")) ++ return false; ++ ++ while (*name) { ++ if (*name == '/' || *name == ':' || isspace(*name)) ++ return false; ++ name++; ++ } ++ return true; ++} ++EXPORT_SYMBOL(dev_valid_name); ++ ++/** ++ * __dev_alloc_name - allocate a name for a device ++ * @net: network namespace to allocate the device name in ++ * @name: name format string ++ * @buf: scratch buffer and result name string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++static int __dev_alloc_name(struct net *net, const char *name, char *buf) ++{ ++ int i = 0; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ unsigned long *inuse; ++ struct net_device *d; ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ p = strchr(name, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. ++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for_each_netdev(net, d) { ++ struct netdev_name_node *name_node; ++ list_for_each_entry(name_node, &d->name_node->list, list) { ++ if (!sscanf(name_node->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, name_node->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!netdev_name_in_use(net, buf)) ++ return i; ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. 
++ */ ++ return -ENFILE; ++} ++ ++static int dev_alloc_name_ns(struct net *net, ++ struct net_device *dev, ++ const char *name) ++{ ++ char buf[IFNAMSIZ]; ++ int ret; ++ ++ BUG_ON(!net); ++ ret = __dev_alloc_name(net, name, buf); ++ if (ret >= 0) ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return ret; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ return dev_alloc_name_ns(dev_net(dev), dev, name); ++} ++EXPORT_SYMBOL(dev_alloc_name); ++ ++static int dev_get_valid_name(struct net *net, struct net_device *dev, ++ const char *name) ++{ ++ BUG_ON(!net); ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ if (strchr(name, '%')) ++ return dev_alloc_name_ns(net, dev, name); ++ else if (netdev_name_in_use(net, name)) ++ return -EEXIST; ++ else if (dev->name != name) ++ strlcpy(dev->name, name, IFNAMSIZ); ++ ++ return 0; ++} ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. ++ */ ++int dev_change_name(struct net_device *dev, const char *newname) ++{ ++ unsigned char old_assign_type; ++ char oldname[IFNAMSIZ]; ++ int err = 0; ++ int ret; ++ struct net *net; ++ ++ ASSERT_RTNL(); ++ BUG_ON(!dev_net(dev)); ++ ++ net = dev_net(dev); ++ ++ /* Some auto-enslaved devices e.g. failover slaves are ++ * special, as userspace might rename the device after ++ * the interface had been brought up and running since ++ * the point kernel initiated auto-enslavement. Allow ++ * live name change even when these slave devices are ++ * up and running. ++ * ++ * Typically, users of these auto-enslaving devices ++ * don't actually care about slave name change, as ++ * they are supposed to operate on master interface ++ * directly. 
++ */ ++ if (dev->flags & IFF_UP && ++ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) ++ return -EBUSY; ++ ++ down_write(&devnet_rename_sem); ++ ++ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { ++ up_write(&devnet_rename_sem); ++ return 0; ++ } ++ ++ memcpy(oldname, dev->name, IFNAMSIZ); ++ ++ err = dev_get_valid_name(net, dev, newname); ++ if (err < 0) { ++ up_write(&devnet_rename_sem); ++ return err; ++ } ++ ++ if (oldname[0] && !strchr(oldname, '%')) ++ netdev_info(dev, "renamed from %s\n", oldname); ++ ++ old_assign_type = dev->name_assign_type; ++ dev->name_assign_type = NET_NAME_RENAMED; ++ ++rollback: ++ ret = device_rename(&dev->dev, dev->name); ++ if (ret) { ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ up_write(&devnet_rename_sem); ++ return ret; ++ } ++ ++ up_write(&devnet_rename_sem); ++ ++ netdev_adjacent_rename_links(dev, oldname); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_del(dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ synchronize_rcu(); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_add(net, dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ++ ret = notifier_to_errno(ret); ++ ++ if (ret) { ++ /* err >= 0 after dev_alloc_name() or stores the first errno */ ++ if (err >= 0) { ++ err = ret; ++ down_write(&devnet_rename_sem); ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ memcpy(oldname, newname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ old_assign_type = NET_NAME_RENAMED; ++ goto rollback; ++ } else { ++ netdev_err(dev, "name change rollback failed: %d\n", ++ ret); ++ } ++ } ++ ++ return err; ++} ++ ++/** ++ * dev_set_alias - change ifalias of a device ++ * @dev: device ++ * @alias: name up to IFALIASZ ++ * @len: limit of bytes to copy from info ++ * ++ * Set ifalias for a device, ++ */ ++int dev_set_alias(struct net_device *dev, const char *alias, size_t len) ++{ ++ struct dev_ifalias *new_alias = NULL; ++ ++ if (len >= IFALIASZ) ++ return -EINVAL; ++ ++ if (len) { ++ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); ++ if (!new_alias) ++ return -ENOMEM; ++ ++ memcpy(new_alias->ifalias, alias, len); ++ new_alias->ifalias[len] = 0; ++ } ++ ++ mutex_lock(&ifalias_mutex); ++ new_alias = rcu_replace_pointer(dev->ifalias, new_alias, ++ mutex_is_locked(&ifalias_mutex)); ++ mutex_unlock(&ifalias_mutex); ++ ++ if (new_alias) ++ kfree_rcu(new_alias, rcuhead); ++ ++ return len; ++} ++EXPORT_SYMBOL(dev_set_alias); ++ ++/** ++ * dev_get_alias - get ifalias of a device ++ * @dev: device ++ * @name: buffer to store name of ifalias ++ * @len: size of buffer ++ * ++ * get ifalias for a device. Caller must make sure dev cannot go ++ * away, e.g. rcu read lock or own a reference count to device. ++ */ ++int dev_get_alias(const struct net_device *dev, char *name, size_t len) ++{ ++ const struct dev_ifalias *alias; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ alias = rcu_dereference(dev->ifalias); ++ if (alias) ++ ret = snprintf(name, len, "%s", alias->ifalias); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/** ++ * netdev_features_change - device changes features ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. 
++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ struct netdev_notifier_change_info change_info = { ++ .info.dev = dev, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, ++ &change_info.info); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); ++ } ++} ++EXPORT_SYMBOL(netdev_state_change); ++ ++/** ++ * __netdev_notify_peers - notify network peers about existence of @dev, ++ * to be called when rtnl lock is already held. ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void __netdev_notify_peers(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); ++ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); ++} ++EXPORT_SYMBOL(__netdev_notify_peers); ++ ++/** ++ * netdev_notify_peers - notify network peers about existence of @dev ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void netdev_notify_peers(struct net_device *dev) ++{ ++ rtnl_lock(); ++ __netdev_notify_peers(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(netdev_notify_peers); ++ ++static int napi_threaded_poll(void *data); ++ ++static int napi_kthread_create(struct napi_struct *n) ++{ ++ int err = 0; ++ ++ /* Create and wake up the kthread once to put it in ++ * TASK_INTERRUPTIBLE mode to avoid the blocked task ++ * warning and work with loadavg. ++ */ ++ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", ++ n->dev->name, n->napi_id); ++ if (IS_ERR(n->thread)) { ++ err = PTR_ERR(n->thread); ++ pr_err("kthread_run failed with err %d\n", err); ++ n->thread = NULL; ++ } ++ ++ return err; ++} ++ ++static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int ret; ++ ++ ASSERT_RTNL(); ++ dev_addr_check(dev); ++ ++ if (!netif_device_present(dev)) { ++ /* may be detached because parent is runtime-suspended */ ++ if (dev->dev.parent) ++ pm_runtime_resume(dev->dev.parent); ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ } ++ ++ /* Block netpoll from trying to do any rx path servicing. 
++ * If we don't do this there is a chance ndo_poll_controller ++ * or ndo_poll may be running while we open the device ++ */ ++ netpoll_poll_disable(dev); ++ ++ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ if (ops->ndo_validate_addr) ++ ret = ops->ndo_validate_addr(dev); ++ ++ if (!ret && ops->ndo_open) ++ ret = ops->ndo_open(dev); ++ ++ netpoll_poll_enable(dev); ++ ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ else { ++ dev->flags |= IFF_UP; ++ dev_set_rx_mode(dev); ++ dev_activate(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ } ++ ++ return ret; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * @extack: netlink extended ack ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ ret = __dev_open(dev, extack); ++ if (ret < 0) ++ return ret; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ ++ return ret; ++} ++EXPORT_SYMBOL(dev_open); ++ ++static void __dev_close_many(struct list_head *head) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ might_sleep(); ++ ++ list_for_each_entry(dev, head, close_list) { ++ /* Temporarily disable netpoll until the interface is down */ ++ netpoll_poll_disable(dev); ++ ++ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, it ++ * can be even on different cpu. So just clear netif_running(). ++ * ++ * dev->stop() will invoke napi_disable() on all of it's ++ * napi_struct instances on this device. ++ */ ++ smp_mb__after_atomic(); /* Commit netif_running(). */ ++ } ++ ++ dev_deactivate_many(head); ++ ++ list_for_each_entry(dev, head, close_list) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (ops->ndo_stop) ++ ops->ndo_stop(dev); ++ ++ dev->flags &= ~IFF_UP; ++ netpoll_poll_enable(dev); ++ } ++} ++ ++static void __dev_close(struct net_device *dev) ++{ ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ __dev_close_many(&single); ++ list_del(&single); ++} ++ ++void dev_close_many(struct list_head *head, bool unlink) ++{ ++ struct net_device *dev, *tmp; ++ ++ /* Remove the devices that don't need to be closed */ ++ list_for_each_entry_safe(dev, tmp, head, close_list) ++ if (!(dev->flags & IFF_UP)) ++ list_del_init(&dev->close_list); ++ ++ __dev_close_many(head); ++ ++ list_for_each_entry_safe(dev, tmp, head, close_list) { ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ if (unlink) ++ list_del_init(&dev->close_list); ++ } ++} ++EXPORT_SYMBOL(dev_close_many); ++ ++/** ++ * dev_close - shutdown an interface. 
++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++void dev_close(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ dev_close_many(&single, true); ++ list_del(&single); ++ } ++} ++EXPORT_SYMBOL(dev_close); ++ ++ ++/** ++ * dev_disable_lro - disable Large Receive Offload on a device ++ * @dev: device ++ * ++ * Disable Large Receive Offload (LRO) on a net device. Must be ++ * called under RTNL. This is needed if received packets may be ++ * forwarded to another interface. ++ */ ++void dev_disable_lro(struct net_device *dev) ++{ ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ ++ dev->wanted_features &= ~NETIF_F_LRO; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_LRO)) ++ netdev_WARN(dev, "failed to disable LRO!\n"); ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) ++ dev_disable_lro(lower_dev); ++} ++EXPORT_SYMBOL(dev_disable_lro); ++ ++/** ++ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device ++ * @dev: device ++ * ++ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be ++ * called under RTNL. This is needed if Generic XDP is installed on ++ * the device. ++ */ ++static void dev_disable_gro_hw(struct net_device *dev) ++{ ++ dev->wanted_features &= ~NETIF_F_GRO_HW; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_GRO_HW)) ++ netdev_WARN(dev, "failed to disable GRO_HW!\n"); ++} ++ ++const char *netdev_cmd_to_name(enum netdev_cmd cmd) ++{ ++#define N(val) \ ++ case NETDEV_##val: \ ++ return "NETDEV_" __stringify(val); ++ switch (cmd) { ++ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) ++ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) ++ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) ++ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) ++ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) ++ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) ++ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) ++ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) ++ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) ++ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) ++ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) ++ } ++#undef N ++ return "UNKNOWN_NETDEV_EVENT"; ++} ++EXPORT_SYMBOL_GPL(netdev_cmd_to_name); ++ ++static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, ++ struct net_device *dev) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ }; ++ ++ return nb->notifier_call(nb, val, &info); ++} ++ ++static int call_netdevice_register_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ int err; ++ ++ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ call_netdevice_notifier(nb, NETDEV_UP, dev); ++ return 0; ++} ++ ++static void call_netdevice_unregister_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ call_netdevice_notifier(nb, NETDEV_GOING_DOWN, ++ dev); ++ call_netdevice_notifier(nb, NETDEV_DOWN, dev); ++ } ++ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 
++} ++ ++static int call_netdevice_register_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ int err; ++ ++ for_each_netdev(net, dev) { ++ err = call_netdevice_register_notifiers(nb, dev); ++ if (err) ++ goto rollback; ++ } ++ return 0; ++ ++rollback: ++ for_each_netdev_continue_reverse(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++ return err; ++} ++ ++static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++} ++ ++static int dev_boot_phase = 1; ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_register(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ if (dev_boot_phase) ++ goto unlock; ++ for_each_net(net) { ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err) ++ goto rollback; ++ } ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++ ++rollback: ++ for_each_net_continue_reverse(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++ raw_notifier_chain_unregister(&netdev_chain, nb); ++ goto unlock; ++} ++EXPORT_SYMBOL(register_netdevice_notifier); ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_unregister(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ ++ for_each_net(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++ ++static int __register_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb, ++ bool ignore_call_fail) ++{ ++ int err; ++ ++ err = raw_notifier_chain_register(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ if (dev_boot_phase) ++ return 0; ++ ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err && !ignore_call_fail) ++ goto chain_unregister; ++ ++ return 0; ++ ++chain_unregister: ++ raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ return err; ++} ++ ++static int __unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ err = raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ ++ call_netdevice_unregister_net_notifiers(nb, net); ++ return 0; ++} ++ ++/** ++ * register_netdevice_notifier_net - register a per-netns network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(net, nb, false); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_net); ++ ++/** ++ * unregister_netdevice_notifier_net - unregister a per-netns ++ * network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __unregister_netdevice_notifier_net(net, nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_net); ++ ++int register_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(dev_net(dev), nb, false); ++ if (!err) { ++ nn->nb = nb; ++ list_add(&nn->list, &dev->net_notifier_list); ++ } ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_dev_net); ++ ++int unregister_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ list_del(&nn->list); ++ err = __unregister_netdevice_notifier_net(dev_net(dev), nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); ++ ++static void move_netdevice_notifiers_dev_net(struct net_device *dev, ++ struct net *net) ++{ ++ struct netdev_net_notifier *nn; ++ ++ list_for_each_entry(nn, &dev->net_notifier_list, list) { ++ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); ++ __register_netdevice_notifier_net(net, nn->nb, true); ++ } ++} ++ ++/** ++ * call_netdevice_notifiers_info - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @info: notifier information data ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* Run per-netns notifier block chain first, then run the global one. ++ * Hopefully, one day, the global one is going to be removed after ++ * all notifier block registrators get converted to be per-netns. ++ */ ++ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); ++ if (ret & NOTIFY_STOP_MASK) ++ return ret; ++ return raw_notifier_call_chain(&netdev_chain, val, info); ++} ++ ++/** ++ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks ++ * for and rollback on error ++ * @val_up: value passed unmodified to notifier function ++ * @val_down: value passed unmodified to the notifier function when ++ * recovering from an error on @val_up ++ * @info: notifier information data ++ * ++ * Call all per-netns network notifier blocks, but not notifier blocks on ++ * the global notifier chain. Parameters and return value are as for ++ * raw_notifier_call_chain_robust(). 
++ */ ++ ++static int ++call_netdevice_notifiers_info_robust(unsigned long val_up, ++ unsigned long val_down, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ ++ ASSERT_RTNL(); ++ ++ return raw_notifier_call_chain_robust(&net->netdev_chain, ++ val_up, val_down, info); ++} ++ ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ .extack = extack, ++ }; ++ ++ return call_netdevice_notifiers_info(val, &info); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ++{ ++ return call_netdevice_notifiers_extack(val, dev, NULL); ++} ++EXPORT_SYMBOL(call_netdevice_notifiers); ++ ++/** ++ * call_netdevice_notifiers_mtu - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * @arg: additional u32 argument passed to the notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++static int call_netdevice_notifiers_mtu(unsigned long val, ++ struct net_device *dev, u32 arg) ++{ ++ struct netdev_notifier_info_ext info = { ++ .info.dev = dev, ++ .ext.mtu = arg, ++ }; ++ ++ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); ++ ++ return call_netdevice_notifiers_info(val, &info.info); ++} ++ ++#ifdef CONFIG_NET_INGRESS ++static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); ++ ++void net_inc_ingress_queue(void) ++{ ++ static_branch_inc(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_ingress_queue); ++ ++void net_dec_ingress_queue(void) ++{ ++ static_branch_dec(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_ingress_queue); ++#endif ++ ++#ifdef CONFIG_NET_EGRESS ++static DEFINE_STATIC_KEY_FALSE(egress_needed_key); ++ ++void net_inc_egress_queue(void) ++{ ++ static_branch_inc(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_egress_queue); ++ ++void net_dec_egress_queue(void) ++{ ++ static_branch_dec(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_egress_queue); ++#endif ++ ++DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); ++EXPORT_SYMBOL(netstamp_needed_key); ++#ifdef CONFIG_JUMP_LABEL ++static atomic_t netstamp_needed_deferred; ++static atomic_t netstamp_wanted; ++static void netstamp_clear(struct work_struct *work) ++{ ++ int deferred = atomic_xchg(&netstamp_needed_deferred, 0); ++ int wanted; ++ ++ wanted = atomic_add_return(deferred, &netstamp_wanted); ++ if (wanted > 0) ++ static_branch_enable(&netstamp_needed_key); ++ else ++ static_branch_disable(&netstamp_needed_key); ++} ++static DECLARE_WORK(netstamp_work, netstamp_clear); ++#endif ++ ++void net_enable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_inc(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_enable_timestamp); ++ 
++void net_disable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_dec(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_disable_timestamp); ++ ++static inline void net_timestamp_set(struct sk_buff *skb) ++{ ++ skb->tstamp = 0; ++ skb->mono_delivery_time = 0; ++ if (static_branch_unlikely(&netstamp_needed_key)) ++ skb->tstamp = ktime_get_real(); ++} ++ ++#define net_timestamp_check(COND, SKB) \ ++ if (static_branch_unlikely(&netstamp_needed_key)) { \ ++ if ((COND) && !(SKB)->tstamp) \ ++ (SKB)->tstamp = ktime_get_real(); \ ++ } \ ++ ++bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) ++{ ++ return __is_skb_forwardable(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(is_skb_forwardable); ++ ++static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, ++ bool check_mtu) ++{ ++ int ret = ____dev_forward_skb(dev, skb, check_mtu); ++ ++ if (likely(!ret)) { ++ skb->protocol = eth_type_trans(skb, dev); ++ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); ++ } ++ ++ return ret; ++} ++ ++int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(__dev_forward_skb); ++ ++/** ++ * dev_forward_skb - loopback an skb to another netif ++ * ++ * @dev: destination network device ++ * @skb: buffer to forward ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped, but freed) ++ * ++ * dev_forward_skb can be used for injecting an skb from the ++ * start_xmit function of one device into the receive queue ++ * of another device. ++ * ++ * The receiving device may be in another namespace, so ++ * we have to clear all information in the skb that could ++ * impact namespace isolation. 
++ */ ++int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); ++} ++EXPORT_SYMBOL_GPL(dev_forward_skb); ++ ++int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); ++} ++ ++static inline int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ return -ENOMEM; ++ refcount_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++} ++ ++static inline void deliver_ptype_list_skb(struct sk_buff *skb, ++ struct packet_type **pt, ++ struct net_device *orig_dev, ++ __be16 type, ++ struct list_head *ptype_list) ++{ ++ struct packet_type *ptype, *pt_prev = *pt; ++ ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->type != type) ++ continue; ++ if (pt_prev) ++ deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ *pt = pt_prev; ++} ++ ++static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) ++{ ++ if (!ptype->af_packet_priv || !skb->sk) ++ return false; ++ ++ if (ptype->id_match) ++ return ptype->id_match(ptype, skb->sk); ++ else if ((struct sock *)ptype->af_packet_priv == skb->sk) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * dev_nit_active - return true if any network interface taps are in use ++ * ++ * @dev: network device to check for the presence of taps ++ */ ++bool dev_nit_active(struct net_device *dev) ++{ ++ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); ++} ++EXPORT_SYMBOL_GPL(dev_nit_active); ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ struct sk_buff *skb2 = NULL; ++ struct packet_type *pt_prev = NULL; ++ struct list_head *ptype_list = &ptype_all; ++ ++ rcu_read_lock(); ++again: ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->ignore_outgoing) ++ continue; ++ ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if (skb_loop_sk(ptype, skb)) ++ continue; ++ ++ if (pt_prev) { ++ deliver_skb(skb2, pt_prev, skb->dev); ++ pt_prev = ptype; ++ continue; ++ } ++ ++ /* need to clone skb, done only once */ ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ goto out_unlock; ++ ++ net_timestamp_set(skb2); ++ ++ /* skb->nh should be correctly ++ * set by sender, so that the second statement is ++ * just protection against buggy protocols. 
++ */ ++ skb_reset_mac_header(skb2); ++ ++ if (skb_network_header(skb2) < skb2->data || ++ skb_network_header(skb2) > skb_tail_pointer(skb2)) { ++ net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ++ ntohs(skb2->protocol), ++ dev->name); ++ skb_reset_network_header(skb2); ++ } ++ ++ skb2->transport_header = skb2->network_header; ++ skb2->pkt_type = PACKET_OUTGOING; ++ pt_prev = ptype; ++ } ++ ++ if (ptype_list == &ptype_all) { ++ ptype_list = &dev->ptype_all; ++ goto again; ++ } ++out_unlock: ++ if (pt_prev) { ++ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) ++ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); ++ else ++ kfree_skb(skb2); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); ++ ++/** ++ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change ++ * @dev: Network device ++ * @txq: number of queues available ++ * ++ * If real_num_tx_queues is changed the tc mappings may no longer be ++ * valid. To resolve this verify the tc mapping remains valid and if ++ * not NULL the mapping. With no priorities mapping to this ++ * offset/count pair it will no longer be used. In the worst case TC0 ++ * is invalid nothing can be done so disable priority mappings. If is ++ * expected that drivers will fix this mapping if they can before ++ * calling netif_set_real_num_tx_queues. ++ */ ++static void netif_setup_tc(struct net_device *dev, unsigned int txq) ++{ ++ int i; ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ ++ /* If TC0 is invalidated disable TC mapping */ ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); ++ dev->num_tc = 0; ++ return; ++ } ++ ++ /* Invalidated prio to tc mappings set to TC0 */ ++ for (i = 1; i < TC_BITMASK + 1; i++) { ++ int q = netdev_get_prio_tc_map(dev, i); ++ ++ tc = &dev->tc_to_txq[q]; ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", ++ i, q); ++ netdev_set_prio_tc_map(dev, i, 0); ++ } ++ } ++} ++ ++int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) ++{ ++ if (dev->num_tc) { ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ int i; ++ ++ /* walk through the TCs and see if it falls into any of them */ ++ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { ++ if ((txq - tc->offset) < tc->count) ++ return i; ++ } ++ ++ /* didn't find it, just return -1 to indicate no match */ ++ return -1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_txq_to_tc); ++ ++#ifdef CONFIG_XPS ++static struct static_key xps_needed __read_mostly; ++static struct static_key xps_rxqs_needed __read_mostly; ++static DEFINE_MUTEX(xps_map_mutex); ++#define xmap_dereference(P) \ ++ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) ++ ++static bool remove_xps_queue(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *old_maps, int tci, u16 index) ++{ ++ struct xps_map *map = NULL; ++ int pos; ++ ++ if (dev_maps) ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ return false; ++ ++ for (pos = map->len; pos--;) { ++ if (map->queues[pos] != index) ++ continue; ++ ++ if (map->len > 1) { ++ map->queues[pos] = map->queues[--map->len]; ++ break; ++ } ++ ++ if (old_maps) ++ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool remove_xps_queue_cpu(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ int cpu, u16 offset, u16 count) ++{ ++ int num_tc = dev_maps->num_tc; ++ bool active = false; ++ int tci; ++ ++ for (tci = cpu * num_tc; num_tc--; tci++) { ++ int i, j; ++ ++ for (i = count, j = offset; i--; j++) { ++ if (!remove_xps_queue(dev_maps, NULL, tci, j)) ++ break; ++ } ++ ++ active |= i < 0; ++ } ++ ++ return active; ++} ++ ++static void reset_xps_maps(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ enum xps_map_type type) ++{ ++ static_key_slow_dec_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_dec_cpuslocked(&xps_rxqs_needed); ++ ++ RCU_INIT_POINTER(dev->xps_maps[type], NULL); ++ ++ kfree_rcu(dev_maps, rcu); ++} ++ ++static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, ++ u16 offset, u16 count) ++{ ++ struct xps_dev_maps *dev_maps; ++ bool active = false; ++ int i, j; ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (!dev_maps) ++ return; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) ++ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++ if (type == XPS_CPUS) { ++ for (i = offset + (count - 1); count--; i--) ++ netdev_queue_numa_node_write( ++ netdev_get_tx_queue(dev, i), NUMA_NO_NODE); ++ } ++} ++ ++static void netif_reset_xps_queues(struct net_device *dev, u16 offset, ++ u16 count) ++{ ++ if (!static_key_false(&xps_needed)) ++ return; ++ ++ cpus_read_lock(); ++ mutex_lock(&xps_map_mutex); ++ ++ if (static_key_false(&xps_rxqs_needed)) ++ clean_xps_maps(dev, XPS_RXQS, offset, count); ++ ++ clean_xps_maps(dev, XPS_CPUS, offset, count); ++ ++ mutex_unlock(&xps_map_mutex); ++ cpus_read_unlock(); ++} ++ ++static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) ++{ ++ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); ++} ++ ++static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, ++ u16 index, bool is_rxqs_map) ++{ ++ struct xps_map *new_map; ++ int alloc_len = 
XPS_MIN_MAP_ALLOC; ++ int i, pos; ++ ++ for (pos = 0; map && pos < map->len; pos++) { ++ if (map->queues[pos] != index) ++ continue; ++ return map; ++ } ++ ++ /* Need to add tx-queue to this CPU's/rx-queue's existing map */ ++ if (map) { ++ if (pos < map->alloc_len) ++ return map; ++ ++ alloc_len = map->alloc_len * 2; ++ } ++ ++ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's ++ * map ++ */ ++ if (is_rxqs_map) ++ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); ++ else ++ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, ++ cpu_to_node(attr_index)); ++ if (!new_map) ++ return NULL; ++ ++ for (i = 0; i < pos; i++) ++ new_map->queues[i] = map->queues[i]; ++ new_map->alloc_len = alloc_len; ++ new_map->len = pos; ++ ++ return new_map; ++} ++ ++/* Copy xps maps at a given index */ ++static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *new_dev_maps, int index, ++ int tc, bool skip_tc) ++{ ++ int i, tci = index * dev_maps->num_tc; ++ struct xps_map *map; ++ ++ /* copy maps belonging to foreign traffic classes */ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && skip_tc) ++ continue; ++ ++ /* fill in the new device map from the old device map */ ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++} ++ ++/* Must be called under cpus_read_lock */ ++int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, ++ u16 index, enum xps_map_type type) ++{ ++ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; ++ const unsigned long *online_mask = NULL; ++ bool active = false, copy = false; ++ int i, j, tci, numa_node_id = -2; ++ int maps_sz, num_tc = 1, tc = 0; ++ struct xps_map *map, *new_map; ++ unsigned int nr_ids; ++ ++ if (dev->num_tc) { ++ /* Do not allow XPS on subordinate device directly */ ++ num_tc = dev->num_tc; ++ if (num_tc < 0) ++ return -EINVAL; ++ ++ /* If queue belongs to subordinate dev use its map */ ++ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; ++ ++ tc = netdev_txq_to_tc(dev, index); ++ if (tc < 0) ++ return -EINVAL; ++ } ++ ++ mutex_lock(&xps_map_mutex); ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (type == XPS_RXQS) { ++ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); ++ nr_ids = dev->num_rx_queues; ++ } else { ++ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); ++ if (num_possible_cpus() > 1) ++ online_mask = cpumask_bits(cpu_online_mask); ++ nr_ids = nr_cpu_ids; ++ } ++ ++ if (maps_sz < L1_CACHE_BYTES) ++ maps_sz = L1_CACHE_BYTES; ++ ++ /* The old dev_maps could be larger or smaller than the one we're ++ * setting up now, as dev->num_tc or nr_ids could have been updated in ++ * between. We could try to be smart, but let's be safe instead and only ++ * copy foreign traffic classes if the two map sizes match. ++ */ ++ if (dev_maps && ++ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) ++ copy = true; ++ ++ /* allocate memory for queue storage */ ++ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), ++ j < nr_ids;) { ++ if (!new_dev_maps) { ++ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); ++ if (!new_dev_maps) { ++ mutex_unlock(&xps_map_mutex); ++ return -ENOMEM; ++ } ++ ++ new_dev_maps->nr_ids = nr_ids; ++ new_dev_maps->num_tc = num_tc; ++ } ++ ++ tci = j * num_tc + tc; ++ map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; ++ ++ map = expand_xps_map(map, j, index, type == XPS_RXQS); ++ if (!map) ++ goto error; ++ ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++ ++ if (!new_dev_maps) ++ goto out_no_new_maps; ++ ++ if (!dev_maps) { ++ /* Increment static keys at most once per type */ ++ static_key_slow_inc_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_inc_cpuslocked(&xps_rxqs_needed); ++ } ++ ++ for (j = 0; j < nr_ids; j++) { ++ bool skip_tc = false; ++ ++ tci = j * num_tc + tc; ++ if (netif_attr_test_mask(j, mask, nr_ids) && ++ netif_attr_test_online(j, online_mask, nr_ids)) { ++ /* add tx-queue to CPU/rx-queue maps */ ++ int pos = 0; ++ ++ skip_tc = true; ++ ++ map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ while ((pos < map->len) && (map->queues[pos] != index)) ++ pos++; ++ ++ if (pos == map->len) ++ map->queues[map->len++] = index; ++#ifdef CONFIG_NUMA ++ if (type == XPS_CPUS) { ++ if (numa_node_id == -2) ++ numa_node_id = cpu_to_node(j); ++ else if (numa_node_id != cpu_to_node(j)) ++ numa_node_id = -1; ++ } ++#endif ++ } ++ ++ if (copy) ++ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, ++ skip_tc); ++ } ++ ++ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); ++ ++ /* Cleanup old maps */ ++ if (!dev_maps) ++ goto out_no_old_maps; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ continue; ++ ++ if (copy) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ if (map == new_map) ++ continue; ++ } ++ ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ } ++ } ++ ++ old_dev_maps = dev_maps; ++ ++out_no_old_maps: ++ dev_maps = new_dev_maps; ++ active = true; ++ ++out_no_new_maps: ++ if (type == XPS_CPUS) ++ /* update Tx queue numa node */ ++ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), ++ (numa_node_id >= 0) ? ++ numa_node_id : NUMA_NO_NODE); ++ ++ if (!dev_maps) ++ goto out_no_maps; ++ ++ /* removes tx-queue from unused CPUs/rx-queues */ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ tci = j * dev_maps->num_tc; ++ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && ++ netif_attr_test_mask(j, mask, dev_maps->nr_ids) && ++ netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) ++ continue; ++ ++ active |= remove_xps_queue(dev_maps, ++ copy ? old_dev_maps : NULL, ++ tci, index); ++ } ++ } ++ ++ if (old_dev_maps) ++ kfree_rcu(old_dev_maps, rcu); ++ ++ /* free map if not active */ ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++out_no_maps: ++ mutex_unlock(&xps_map_mutex); ++ ++ return 0; ++error: ++ /* remove any maps that we added */ ++ for (j = 0; j < nr_ids; j++) { ++ for (i = num_tc, tci = j * num_tc; i--; tci++) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ map = copy ? 
++ xmap_dereference(dev_maps->attr_map[tci]) : ++ NULL; ++ if (new_map && new_map != map) ++ kfree(new_map); ++ } ++ } ++ ++ mutex_unlock(&xps_map_mutex); ++ ++ kfree(new_dev_maps); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(__netif_set_xps_queue); ++ ++int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, ++ u16 index) ++{ ++ int ret; ++ ++ cpus_read_lock(); ++ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); ++ cpus_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_set_xps_queue); ++ ++#endif ++static void netdev_unbind_all_sb_channels(struct net_device *dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++ /* Unbind any subordinate channels */ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev) ++ netdev_unbind_sb_channel(dev, txq->sb_dev); ++ } ++} ++ ++void netdev_reset_tc(struct net_device *dev) ++{ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ /* Reset TC configuration of device */ ++ dev->num_tc = 0; ++ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); ++ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); ++} ++EXPORT_SYMBOL(netdev_reset_tc); ++ ++int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) ++{ ++ if (tc >= dev->num_tc) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues(dev, offset, count); ++#endif ++ dev->tc_to_txq[tc].count = count; ++ dev->tc_to_txq[tc].offset = offset; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_tc_queue); ++ ++int netdev_set_num_tc(struct net_device *dev, u8 num_tc) ++{ ++ if (num_tc > TC_MAX_QUEUE) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ dev->num_tc = num_tc; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_num_tc); ++ ++void netdev_unbind_sb_channel(struct net_device *dev, ++ struct net_device *sb_dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(sb_dev, 0); ++#endif ++ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); ++ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); ++ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev == sb_dev) ++ txq->sb_dev = NULL; ++ } ++} ++EXPORT_SYMBOL(netdev_unbind_sb_channel); ++ ++int netdev_bind_sb_channel_queue(struct net_device *dev, ++ struct net_device *sb_dev, ++ u8 tc, u16 count, u16 offset) ++{ ++ /* Make certain the sb_dev and dev are already configured */ ++ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) ++ return -EINVAL; ++ ++ /* We cannot hand out queues we don't have */ ++ if ((offset + count) > dev->real_num_tx_queues) ++ return -EINVAL; ++ ++ /* Record the mapping */ ++ sb_dev->tc_to_txq[tc].count = count; ++ sb_dev->tc_to_txq[tc].offset = offset; ++ ++ /* Provide a way for Tx queue to find the tc_to_txq map or ++ * XPS map for itself. ++ */ ++ while (count--) ++ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_bind_sb_channel_queue); ++ ++int netdev_set_sb_channel(struct net_device *dev, u16 channel) ++{ ++ /* Do not use a multiqueue device to represent a subordinate channel */ ++ if (netif_is_multiqueue(dev)) ++ return -ENODEV; ++ ++ /* We allow channels 1 - 32767 to be used for subordinate channels. ++ * Channel 0 is meant to be "native" mode and used only to represent ++ * the main root device. 
We allow writing 0 to reset the device back ++ * to normal mode after being used as a subordinate channel. ++ */ ++ if (channel > S16_MAX) ++ return -EINVAL; ++ ++ dev->num_tc = -channel; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_sb_channel); ++ ++/* ++ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues ++ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. ++ */ ++int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) ++{ ++ bool disabling; ++ int rc; ++ ++ disabling = txq < dev->real_num_tx_queues; ++ ++ if (txq < 1 || txq > dev->num_tx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED || ++ dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ ++ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, ++ txq); ++ if (rc) ++ return rc; ++ ++ if (dev->num_tc) ++ netif_setup_tc(dev, txq); ++ ++ dev_qdisc_change_real_num_tx(dev, txq); ++ ++ dev->real_num_tx_queues = txq; ++ ++ if (disabling) { ++ synchronize_net(); ++ qdisc_reset_all_tx_gt(dev, txq); ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, txq); ++#endif ++ } ++ } else { ++ dev->real_num_tx_queues = txq; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_tx_queues); ++ ++#ifdef CONFIG_SYSFS ++/** ++ * netif_set_real_num_rx_queues - set actual number of RX queues used ++ * @dev: Network device ++ * @rxq: Actual number of RX queues ++ * ++ * This must be called either with the rtnl_lock held or before ++ * registration of the net device. Returns 0 on success, or a ++ * negative error code. If called before registration, it always ++ * succeeds. ++ */ ++int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) ++{ ++ int rc; ++ ++ if (rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED) { ++ ASSERT_RTNL(); ++ ++ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, ++ rxq); ++ if (rc) ++ return rc; ++ } ++ ++ dev->real_num_rx_queues = rxq; ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_rx_queues); ++#endif ++ ++/** ++ * netif_set_real_num_queues - set actual number of RX and TX queues used ++ * @dev: Network device ++ * @txq: Actual number of TX queues ++ * @rxq: Actual number of RX queues ++ * ++ * Set the real number of both TX and RX queues. ++ * Does nothing if the number of queues is already correct. ++ */ ++int netif_set_real_num_queues(struct net_device *dev, ++ unsigned int txq, unsigned int rxq) ++{ ++ unsigned int old_rxq = dev->real_num_rx_queues; ++ int err; ++ ++ if (txq < 1 || txq > dev->num_tx_queues || ++ rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ /* Start from increases, so the error path only does decreases - ++ * decreases can't fail. 
++ */ ++ if (rxq > dev->real_num_rx_queues) { ++ err = netif_set_real_num_rx_queues(dev, rxq); ++ if (err) ++ return err; ++ } ++ if (txq > dev->real_num_tx_queues) { ++ err = netif_set_real_num_tx_queues(dev, txq); ++ if (err) ++ goto undo_rx; ++ } ++ if (rxq < dev->real_num_rx_queues) ++ WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); ++ if (txq < dev->real_num_tx_queues) ++ WARN_ON(netif_set_real_num_tx_queues(dev, txq)); ++ ++ return 0; ++undo_rx: ++ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); ++ return err; ++} ++EXPORT_SYMBOL(netif_set_real_num_queues); ++ ++/** ++ * netif_set_tso_max_size() - set the max size of TSO frames supported ++ * @dev: netdev to update ++ * @size: max skb->len of a TSO frame ++ * ++ * Set the limit on the size of TSO super-frames the device can handle. ++ * Unless explicitly set the stack will assume the value of ++ * %GSO_LEGACY_MAX_SIZE. ++ */ ++void netif_set_tso_max_size(struct net_device *dev, unsigned int size) ++{ ++ dev->tso_max_size = min(GSO_MAX_SIZE, size); ++ if (size < READ_ONCE(dev->gso_max_size)) ++ netif_set_gso_max_size(dev, size); ++} ++EXPORT_SYMBOL(netif_set_tso_max_size); ++ ++/** ++ * netif_set_tso_max_segs() - set the max number of segs supported for TSO ++ * @dev: netdev to update ++ * @segs: max number of TCP segments ++ * ++ * Set the limit on the number of TCP segments the device can generate from ++ * a single TSO super-frame. ++ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. ++ */ ++void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) ++{ ++ dev->tso_max_segs = segs; ++ if (segs < READ_ONCE(dev->gso_max_segs)) ++ netif_set_gso_max_segs(dev, segs); ++} ++EXPORT_SYMBOL(netif_set_tso_max_segs); ++ ++/** ++ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper ++ * @to: netdev to update ++ * @from: netdev from which to copy the limits ++ */ ++void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) ++{ ++ netif_set_tso_max_size(to, from->tso_max_size); ++ netif_set_tso_max_segs(to, from->tso_max_segs); ++} ++EXPORT_SYMBOL(netif_inherit_tso_max); ++ ++/** ++ * netif_get_num_default_rss_queues - default number of RSS queues ++ * ++ * Default value is the number of physical cores if there are only 1 or 2, or ++ * divided by 2 if there are more. ++ */ ++int netif_get_num_default_rss_queues(void) ++{ ++ cpumask_var_t cpus; ++ int cpu, count = 0; ++ ++ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) ++ return 1; ++ ++ cpumask_copy(cpus, cpu_online_mask); ++ for_each_cpu(cpu, cpus) { ++ ++count; ++ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); ++ } ++ free_cpumask_var(cpus); ++ ++ return count > 2 ? 
DIV_ROUND_UP(count, 2) : count; ++} ++EXPORT_SYMBOL(netif_get_num_default_rss_queues); ++ ++static void __netif_reschedule(struct Qdisc *q) ++{ ++ struct softnet_data *sd; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ sd = this_cpu_ptr(&softnet_data); ++ q->next_sched = NULL; ++ *sd->output_queue_tailp = q; ++ sd->output_queue_tailp = &q->next_sched; ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++ ++void __netif_schedule(struct Qdisc *q) ++{ ++ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) ++ __netif_reschedule(q); ++} ++EXPORT_SYMBOL(__netif_schedule); ++ ++struct dev_kfree_skb_cb { ++ enum skb_free_reason reason; ++}; ++ ++static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) ++{ ++ return (struct dev_kfree_skb_cb *)skb->cb; ++} ++ ++void netif_schedule_queue(struct netdev_queue *txq) ++{ ++ rcu_read_lock(); ++ if (!netif_xmit_stopped(txq)) { ++ struct Qdisc *q = rcu_dereference(txq->qdisc); ++ ++ __netif_schedule(q); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(netif_schedule_queue); ++ ++void netif_tx_wake_queue(struct netdev_queue *dev_queue) ++{ ++ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { ++ struct Qdisc *q; ++ ++ rcu_read_lock(); ++ q = rcu_dereference(dev_queue->qdisc); ++ __netif_schedule(q); ++ rcu_read_unlock(); ++ } ++} ++EXPORT_SYMBOL(netif_tx_wake_queue); ++ ++void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ unsigned long flags; ++ ++ if (unlikely(!skb)) ++ return; ++ ++ if (likely(refcount_read(&skb->users) == 1)) { ++ smp_rmb(); ++ refcount_set(&skb->users, 0); ++ } else if (likely(!refcount_dec_and_test(&skb->users))) { ++ return; ++ } ++ get_kfree_skb_cb(skb)->reason = reason; ++ local_irq_save(flags); ++ skb->next = __this_cpu_read(softnet_data.completion_queue); ++ __this_cpu_write(softnet_data.completion_queue, skb); ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_irq); ++ ++void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ if (in_hardirq() || irqs_disabled()) ++ __dev_kfree_skb_irq(skb, reason); ++ else ++ dev_kfree_skb(skb); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_any); ++ ++ ++/** ++ * netif_device_detach - mark device as removed ++ * @dev: network device ++ * ++ * Mark device as removed from system and therefore no longer available. ++ */ ++void netif_device_detach(struct net_device *dev) ++{ ++ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_stop_all_queues(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_detach); ++ ++/** ++ * netif_device_attach - mark device as attached ++ * @dev: network device ++ * ++ * Mark device as attached from system and restart if needed. ++ */ ++void netif_device_attach(struct net_device *dev) ++{ ++ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_wake_all_queues(dev); ++ __netdev_watchdog_up(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_attach); ++ ++/* ++ * Returns a Tx hash based on the given packet descriptor a Tx queues' number ++ * to be used as a distribution range. 
++ */ ++static u16 skb_tx_hash(const struct net_device *dev, ++ const struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++ u32 hash; ++ u16 qoffset = 0; ++ u16 qcount = dev->real_num_tx_queues; ++ ++ if (dev->num_tc) { ++ u8 tc = netdev_get_prio_tc_map(dev, skb->priority); ++ ++ qoffset = sb_dev->tc_to_txq[tc].offset; ++ qcount = sb_dev->tc_to_txq[tc].count; ++ if (unlikely(!qcount)) { ++ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", ++ sb_dev->name, qoffset, tc); ++ qoffset = 0; ++ qcount = dev->real_num_tx_queues; ++ } ++ } ++ ++ if (skb_rx_queue_recorded(skb)) { ++ hash = skb_get_rx_queue(skb); ++ if (hash >= qoffset) ++ hash -= qoffset; ++ while (unlikely(hash >= qcount)) ++ hash -= qcount; ++ return hash + qoffset; ++ } ++ ++ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; ++} ++ ++static void skb_warn_bad_offload(const struct sk_buff *skb) ++{ ++ static const netdev_features_t null_features; ++ struct net_device *dev = skb->dev; ++ const char *name = ""; ++ ++ if (!net_ratelimit()) ++ return; ++ ++ if (dev) { ++ if (dev->dev.parent) ++ name = dev_driver_string(dev->dev.parent); ++ else ++ name = netdev_name(dev); ++ } ++ skb_dump(KERN_WARNING, skb, false); ++ WARN(1, "%s: caps=(%pNF, %pNF)\n", ++ name, dev ? &dev->features : &null_features, ++ skb->sk ? &skb->sk->sk_route_caps : &null_features); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. ++ */ ++int skb_checksum_help(struct sk_buff *skb) ++{ ++ __wsum csum; ++ int ret = 0, offset; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) ++ goto out_set_summed; ++ ++ if (unlikely(skb_is_gso(skb))) { ++ skb_warn_bad_offload(skb); ++ return -EINVAL; ++ } ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. ++ */ ++ if (skb_has_shared_frag(skb)) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ ++ offset = skb_checksum_start_offset(skb); ++ ret = -EINVAL; ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ csum = skb_checksum(skb, offset, skb->len - offset, 0); ++ ++ offset += skb->csum_offset; ++ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); ++ if (ret) ++ goto out; ++ ++ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; ++out_set_summed: ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++EXPORT_SYMBOL(skb_checksum_help); ++ ++int skb_crc32c_csum_help(struct sk_buff *skb) ++{ ++ __le32 crc32c_csum; ++ int ret = 0, offset, start; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ goto out; ++ ++ if (unlikely(skb_is_gso(skb))) ++ goto out; ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. 
++ */ ++ if (unlikely(skb_has_shared_frag(skb))) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ start = skb_checksum_start_offset(skb); ++ offset = start + offsetof(struct sctphdr, checksum); ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = skb_ensure_writable(skb, offset + sizeof(__le32)); ++ if (ret) ++ goto out; ++ ++ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, ++ skb->len - start, ~(__u32)0, ++ crc32c_csum_stub)); ++ *(__le32 *)(skb->data + offset) = crc32c_csum; ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->csum_not_inet = 0; ++out: ++ return ret; ++} ++ ++__be16 skb_network_protocol(struct sk_buff *skb, int *depth) ++{ ++ __be16 type = skb->protocol; ++ ++ /* Tunnel gso handlers can set protocol to ethernet. */ ++ if (type == htons(ETH_P_TEB)) { ++ struct ethhdr *eth; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) ++ return 0; ++ ++ eth = (struct ethhdr *)skb->data; ++ type = eth->h_proto; ++ } ++ ++ return __vlan_get_protocol(skb, type, depth); ++} ++ ++/* openvswitch calls this on rx path, so we need a different check. ++ */ ++static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) ++{ ++ if (tx_path) ++ return skb->ip_summed != CHECKSUM_PARTIAL && ++ skb->ip_summed != CHECKSUM_UNNECESSARY; ++ ++ return skb->ip_summed == CHECKSUM_NONE; ++} ++ ++/** ++ * __skb_gso_segment - Perform segmentation on skb. ++ * @skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * @tx_path: whether it is called in TX path ++ * ++ * This function segments the given skb and returns a list of segments. ++ * ++ * It may return NULL if the skb requires no segmentation. This is ++ * only possible when GSO is used for verifying header integrity. ++ * ++ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. ++ */ ++struct sk_buff *__skb_gso_segment(struct sk_buff *skb, ++ netdev_features_t features, bool tx_path) ++{ ++ struct sk_buff *segs; ++ ++ if (unlikely(skb_needs_check(skb, tx_path))) { ++ int err; ++ ++ /* We're going to init ->check field in TCP or UDP header */ ++ err = skb_cow_head(skb, 0); ++ if (err < 0) ++ return ERR_PTR(err); ++ } ++ ++ /* Only report GSO partial support if it will enable us to ++ * support segmentation on this frame without needing additional ++ * work. ++ */ ++ if (features & NETIF_F_GSO_PARTIAL) { ++ netdev_features_t partial_features = NETIF_F_GSO_ROBUST; ++ struct net_device *dev = skb->dev; ++ ++ partial_features |= dev->features & dev->gso_partial_features; ++ if (!skb_gso_ok(skb, features | partial_features)) ++ features &= ~NETIF_F_GSO_PARTIAL; ++ } ++ ++ BUILD_BUG_ON(SKB_GSO_CB_OFFSET + ++ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); ++ ++ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); ++ SKB_GSO_CB(skb)->encap_level = 0; ++ ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ segs = skb_mac_gso_segment(skb, features); ++ ++ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) ++ skb_warn_bad_offload(skb); ++ ++ return segs; ++} ++EXPORT_SYMBOL(__skb_gso_segment); ++ ++/* Take action when hardware reception checksum errors are detected. 
*/ ++#ifdef CONFIG_BUG ++static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ netdev_err(dev, "hw csum failure\n"); ++ skb_dump(KERN_ERR, skb, true); ++ dump_stack(); ++} ++ ++void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); ++} ++EXPORT_SYMBOL(netdev_rx_csum_fault); ++#endif ++ ++/* XXX: check that highmem exists at all on the given machine. */ ++static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++#ifdef CONFIG_HIGHMEM ++ int i; ++ ++ if (!(dev->features & NETIF_F_HIGHDMA)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (PageHighMem(skb_frag_page(frag))) ++ return 1; ++ } ++ } ++#endif ++ return 0; ++} ++ ++/* If MPLS offload request, verify we are testing hardware MPLS features ++ * instead of standard features for the netdev. ++ */ ++#if IS_ENABLED(CONFIG_NET_MPLS_GSO) ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ if (eth_p_mpls(type)) ++ features &= skb->dev->mpls_features; ++ ++ return features; ++} ++#else ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ return features; ++} ++#endif ++ ++static netdev_features_t harmonize_features(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ __be16 type; ++ ++ type = skb_network_protocol(skb, NULL); ++ features = net_mpls_features(skb, features, type); ++ ++ if (skb->ip_summed != CHECKSUM_NONE && ++ !can_checksum_protocol(features, type)) { ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ } ++ if (illegal_highdma(skb->dev, skb)) ++ features &= ~NETIF_F_SG; ++ ++ return features; ++} ++ ++netdev_features_t passthru_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return features; ++} ++EXPORT_SYMBOL(passthru_features_check); ++ ++static netdev_features_t dflt_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return vlan_features_check(skb, features); ++} ++ ++static netdev_features_t gso_features_check(const struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ u16 gso_segs = skb_shinfo(skb)->gso_segs; ++ ++ if (gso_segs > READ_ONCE(dev->gso_max_segs)) ++ return features & ~NETIF_F_GSO_MASK; ++ ++ if (!skb_shinfo(skb)->gso_type) { ++ skb_warn_bad_offload(skb); ++ return features & ~NETIF_F_GSO_MASK; ++ } ++ ++ /* Support for GSO partial features requires software ++ * intervention before we can actually process the packets ++ * so we need to strip support for any partial features now ++ * and we can pull them back in after we have partially ++ * segmented the frame. ++ */ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) ++ features &= ~dev->gso_partial_features; ++ ++ /* Make sure to clear the IPv4 ID mangling feature if the ++ * IPv4 header has the potential to be fragmented. ++ */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { ++ struct iphdr *iph = skb->encapsulation ? 
++ inner_ip_hdr(skb) : ip_hdr(skb); ++ ++ if (!(iph->frag_off & htons(IP_DF))) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ } ++ ++ return features; ++} ++ ++netdev_features_t netif_skb_features(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ netdev_features_t features = dev->features; ++ ++ if (skb_is_gso(skb)) ++ features = gso_features_check(skb, dev, features); ++ ++ /* If encapsulation offload request, verify we are testing ++ * hardware encapsulation features instead of standard ++ * features for the netdev ++ */ ++ if (skb->encapsulation) ++ features &= dev->hw_enc_features; ++ ++ if (skb_vlan_tagged(skb)) ++ features = netdev_intersect_features(features, ++ dev->vlan_features | ++ NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_STAG_TX); ++ ++ if (dev->netdev_ops->ndo_features_check) ++ features &= dev->netdev_ops->ndo_features_check(skb, dev, ++ features); ++ else ++ features &= dflt_features_check(skb, dev, features); ++ ++ return harmonize_features(skb, features); ++} ++EXPORT_SYMBOL(netif_skb_features); ++ ++static int xmit_one(struct sk_buff *skb, struct net_device *dev, ++ struct netdev_queue *txq, bool more) ++{ ++ unsigned int len; ++ int rc; ++ ++ if (dev_nit_active(dev)) ++ dev_queue_xmit_nit(skb, dev); ++ ++ len = skb->len; ++ trace_net_dev_start_xmit(skb, dev); ++ rc = netdev_start_xmit(skb, dev, txq, more); ++ trace_net_dev_xmit(skb, rc, dev, len); ++ ++ return rc; ++} ++ ++struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, ++ struct netdev_queue *txq, int *ret) ++{ ++ struct sk_buff *skb = first; ++ int rc = NETDEV_TX_OK; ++ ++ while (skb) { ++ struct sk_buff *next = skb->next; ++ ++ skb_mark_not_on_list(skb); ++ rc = xmit_one(skb, dev, txq, next != NULL); ++ if (unlikely(!dev_xmit_complete(rc))) { ++ skb->next = next; ++ goto out; ++ } ++ ++ skb = next; ++ if (netif_tx_queue_stopped(txq) && skb) { ++ rc = NETDEV_TX_BUSY; ++ break; ++ } ++ } ++ ++out: ++ *ret = rc; ++ return skb; ++} ++ ++static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (skb_vlan_tag_present(skb) && ++ !vlan_hw_offload_capable(features, skb->vlan_proto)) ++ skb = __vlan_hwaccel_push_inside(skb); ++ return skb; ++} ++ ++int skb_csum_hwoffload_help(struct sk_buff *skb, ++ const netdev_features_t features) ++{ ++ if (unlikely(skb_csum_is_sctp(skb))) ++ return !!(features & NETIF_F_SCTP_CRC) ? 
0 : ++ skb_crc32c_csum_help(skb); ++ ++ if (features & NETIF_F_HW_CSUM) ++ return 0; ++ ++ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { ++ switch (skb->csum_offset) { ++ case offsetof(struct tcphdr, check): ++ case offsetof(struct udphdr, check): ++ return 0; ++ } ++ } ++ ++ return skb_checksum_help(skb); ++} ++EXPORT_SYMBOL(skb_csum_hwoffload_help); ++ ++static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ netdev_features_t features; ++ ++ features = netif_skb_features(skb); ++ skb = validate_xmit_vlan(skb, features); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ skb = sk_validate_xmit_skb(skb, dev); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ if (netif_needs_gso(skb, features)) { ++ struct sk_buff *segs; ++ ++ segs = skb_gso_segment(skb, features); ++ if (IS_ERR(segs)) { ++ goto out_kfree_skb; ++ } else if (segs) { ++ consume_skb(skb); ++ skb = segs; ++ } ++ } else { ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not ++ * support checksumming for this protocol, complete ++ * checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb->encapsulation) ++ skb_set_inner_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ else ++ skb_set_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ if (skb_csum_hwoffload_help(skb, features)) ++ goto out_kfree_skb; ++ } ++ } ++ ++ skb = validate_xmit_xfrm(skb, features, again); ++ ++ return skb; ++ ++out_kfree_skb: ++ kfree_skb(skb); ++out_null: ++ dev_core_stats_tx_dropped_inc(dev); ++ return NULL; ++} ++ ++struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ struct sk_buff *next, *head = NULL, *tail; ++ ++ for (; skb != NULL; skb = next) { ++ next = skb->next; ++ skb_mark_not_on_list(skb); ++ ++ /* in case skb wont be segmented, point to itself */ ++ skb->prev = skb; ++ ++ skb = validate_xmit_skb(skb, dev, again); ++ if (!skb) ++ continue; ++ ++ if (!head) ++ head = skb; ++ else ++ tail->next = skb; ++ /* If skb was segmented, skb->prev points to ++ * the last segment. If not, it still contains skb. 
++ */ ++ tail = skb->prev; ++ } ++ return head; ++} ++EXPORT_SYMBOL_GPL(validate_xmit_skb_list); ++ ++static void qdisc_pkt_len_init(struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ ++ /* To get more precise estimation of bytes sent on wire, ++ * we add to pkt_len the headers size of all segments ++ */ ++ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { ++ unsigned int hdr_len; ++ u16 gso_segs = shinfo->gso_segs; ++ ++ /* mac layer + network layer */ ++ hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ /* + transport layer */ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ const struct tcphdr *th; ++ struct tcphdr _tcphdr; ++ ++ th = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_tcphdr), &_tcphdr); ++ if (likely(th)) ++ hdr_len += __tcp_hdrlen(th); ++ } else { ++ struct udphdr _udphdr; ++ ++ if (skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_udphdr), &_udphdr)) ++ hdr_len += sizeof(struct udphdr); ++ } ++ ++ if (shinfo->gso_type & SKB_GSO_DODGY) ++ gso_segs = DIV_ROUND_UP(skb->len - hdr_len, ++ shinfo->gso_size); ++ ++ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; ++ } ++} ++ ++static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, ++ struct sk_buff **to_free, ++ struct netdev_queue *txq) ++{ ++ int rc; ++ ++ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; ++ if (rc == NET_XMIT_SUCCESS) ++ trace_qdisc_enqueue(q, txq, skb); ++ return rc; ++} ++ ++static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ++ struct net_device *dev, ++ struct netdev_queue *txq) ++{ ++ spinlock_t *root_lock = qdisc_lock(q); ++ struct sk_buff *to_free = NULL; ++ bool contended; ++ int rc; ++ ++ qdisc_calculate_pkt_len(skb, q); ++ ++ if (q->flags & TCQ_F_NOLOCK) { ++ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && ++ qdisc_run_begin(q)) { ++ /* Retest nolock_qdisc_is_empty() within the protection ++ * of q->seqlock to protect from racing with requeuing. ++ */ ++ if (unlikely(!nolock_qdisc_is_empty(q))) { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ ++ goto no_lock_out; ++ } ++ ++ qdisc_bstats_cpu_update(q, skb); ++ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && ++ !nolock_qdisc_is_empty(q)) ++ __qdisc_run(q); ++ ++ qdisc_run_end(q); ++ return NET_XMIT_SUCCESS; ++ } ++ ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ qdisc_run(q); ++ ++no_lock_out: ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, ++ SKB_DROP_REASON_QDISC_DROP); ++ return rc; ++ } ++ ++ /* ++ * Heuristic to force contended enqueues to serialize on a ++ * separate lock before trying to get qdisc main lock. ++ * This permits qdisc->running owner to get the lock more ++ * often and dequeue packets faster. ++ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit ++ * and then other tasks will only enqueue packets. The packets will be ++ * sent after the qdisc owner is scheduled again. To prevent this ++ * scenario the task always serialize on the lock. 
++ */ ++ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); ++ if (unlikely(contended)) ++ spin_lock(&q->busylock); ++ ++ spin_lock(root_lock); ++ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { ++ __qdisc_drop(skb, &to_free); ++ rc = NET_XMIT_DROP; ++ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && ++ qdisc_run_begin(q)) { ++ /* ++ * This is a work-conserving queue; there are no old skbs ++ * waiting to be sent out; and the qdisc is not running - ++ * xmit the skb directly. ++ */ ++ ++ qdisc_bstats_update(q, skb); ++ ++ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ } ++ ++ qdisc_run_end(q); ++ rc = NET_XMIT_SUCCESS; ++ } else { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ if (qdisc_run_begin(q)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ } ++ } ++ spin_unlock(root_lock); ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); ++ if (unlikely(contended)) ++ spin_unlock(&q->busylock); ++ return rc; ++} ++ ++#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) ++static void skb_update_prio(struct sk_buff *skb) ++{ ++ const struct netprio_map *map; ++ const struct sock *sk; ++ unsigned int prioidx; ++ ++ if (skb->priority) ++ return; ++ map = rcu_dereference_bh(skb->dev->priomap); ++ if (!map) ++ return; ++ sk = skb_to_full_sk(skb); ++ if (!sk) ++ return; ++ ++ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); ++ ++ if (prioidx < map->priomap_len) ++ skb->priority = map->priomap[prioidx]; ++} ++#else ++#define skb_update_prio(skb) ++#endif ++ ++/** ++ * dev_loopback_xmit - loop back @skb ++ * @net: network namespace this loopback is happening in ++ * @sk: sk needed to be a netfilter okfn ++ * @skb: buffer to transmit ++ */ ++int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) ++{ ++ skb_reset_mac_header(skb); ++ __skb_pull(skb, skb_network_offset(skb)); ++ skb->pkt_type = PACKET_LOOPBACK; ++ if (skb->ip_summed == CHECKSUM_NONE) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); ++ skb_dst_force(skb); ++ netif_rx(skb); ++ return 0; ++} ++EXPORT_SYMBOL(dev_loopback_xmit); ++ ++#ifdef CONFIG_NET_EGRESS ++static struct sk_buff * ++sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); ++ struct tcf_result cl_res; ++ ++ if (!miniq) ++ return skb; ++ ++ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ *ret = NET_XMIT_DROP; ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ *ret = NET_XMIT_SUCCESS; ++ consume_skb(skb); ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* No need to push/pop skb's mac_header here on egress! 
*/ ++ skb_do_redirect(skb); ++ *ret = NET_XMIT_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ ++ return skb; ++} ++ ++static struct netdev_queue * ++netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) ++{ ++ int qm = skb_get_queue_mapping(skb); ++ ++ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); ++} ++ ++static bool netdev_xmit_txqueue_skipped(void) ++{ ++ return __this_cpu_read(softnet_data.xmit.skip_txqueue); ++} ++ ++void netdev_xmit_skip_txqueue(bool skip) ++{ ++ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); ++} ++EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); ++#endif /* CONFIG_NET_EGRESS */ ++ ++#ifdef CONFIG_XPS ++static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, ++ struct xps_dev_maps *dev_maps, unsigned int tci) ++{ ++ int tc = netdev_get_prio_tc_map(dev, skb->priority); ++ struct xps_map *map; ++ int queue_index = -1; ++ ++ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) ++ return queue_index; ++ ++ tci *= dev_maps->num_tc; ++ tci += tc; ++ ++ map = rcu_dereference(dev_maps->attr_map[tci]); ++ if (map) { ++ if (map->len == 1) ++ queue_index = map->queues[0]; ++ else ++ queue_index = map->queues[reciprocal_scale( ++ skb_get_hash(skb), map->len)]; ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = -1; ++ } ++ return queue_index; ++} ++#endif ++ ++static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++#ifdef CONFIG_XPS ++ struct xps_dev_maps *dev_maps; ++ struct sock *sk = skb->sk; ++ int queue_index = -1; ++ ++ if (!static_key_false(&xps_needed)) ++ return -1; ++ ++ rcu_read_lock(); ++ if (!static_key_false(&xps_rxqs_needed)) ++ goto get_cpus_map; ++ ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); ++ if (dev_maps) { ++ int tci = sk_rx_queue_get(sk); ++ ++ if (tci >= 0) ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ ++get_cpus_map: ++ if (queue_index < 0) { ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); ++ if (dev_maps) { ++ unsigned int tci = skb->sender_cpu - 1; ++ ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return queue_index; ++#else ++ return -1; ++#endif ++} ++ ++u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return 0; ++} ++EXPORT_SYMBOL(dev_pick_tx_zero); ++ ++u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; ++} ++EXPORT_SYMBOL(dev_pick_tx_cpu_id); ++ ++u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ struct sock *sk = skb->sk; ++ int queue_index = sk_tx_queue_get(sk); ++ ++ sb_dev = sb_dev ? 
: dev; ++ ++ if (queue_index < 0 || skb->ooo_okay || ++ queue_index >= dev->real_num_tx_queues) { ++ int new_index = get_xps_queue(dev, sb_dev, skb); ++ ++ if (new_index < 0) ++ new_index = skb_tx_hash(dev, sb_dev, skb); ++ ++ if (queue_index != new_index && sk && ++ sk_fullsock(sk) && ++ rcu_access_pointer(sk->sk_dst_cache)) ++ sk_tx_queue_set(sk, new_index); ++ ++ queue_index = new_index; ++ } ++ ++ return queue_index; ++} ++EXPORT_SYMBOL(netdev_pick_tx); ++ ++struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, ++ struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ int queue_index = 0; ++ ++#ifdef CONFIG_XPS ++ u32 sender_cpu = skb->sender_cpu - 1; ++ ++ if (sender_cpu >= (u32)NR_CPUS) ++ skb->sender_cpu = raw_smp_processor_id() + 1; ++#endif ++ ++ if (dev->real_num_tx_queues != 1) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_select_queue) ++ queue_index = ops->ndo_select_queue(dev, skb, sb_dev); ++ else ++ queue_index = netdev_pick_tx(dev, skb, sb_dev); ++ ++ queue_index = netdev_cap_txqueue(dev, queue_index); ++ } ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++/** ++ * __dev_queue_xmit() - transmit a buffer ++ * @skb: buffer to transmit ++ * @sb_dev: suboordinate device used for L2 forwarding offload ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) ++ * ++ * Return: ++ * * 0 - buffer successfully transmitted ++ * * positive qdisc return code - NET_XMIT_DROP etc. ++ * * negative errno - other errors ++ */ ++int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq = NULL; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ bool again = false; ++ ++ skb_reset_mac_header(skb); ++ skb_assert_len(skb); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) ++ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ rcu_read_lock_bh(); ++ ++ skb_update_prio(skb); ++ ++ qdisc_pkt_len_init(skb); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_at_ingress = 0; ++#endif ++#ifdef CONFIG_NET_EGRESS ++ if (static_branch_unlikely(&egress_needed_key)) { ++ if (nf_hook_egress_active()) { ++ skb = nf_hook_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ } ++ ++ netdev_xmit_skip_txqueue(false); ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ nf_skip_egress(skb, false); ++ ++ if (netdev_xmit_txqueue_skipped()) ++ txq = netdev_tx_queue_mapping(dev, skb); ++ } ++#endif ++ /* If device/qdisc don't need skb->dst, release it right now while ++ * its hot in this cpu cache. 
++ */ ++ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) ++ skb_dst_drop(skb); ++ else ++ skb_dst_force(skb); ++ ++ if (!txq) ++ txq = netdev_core_pick_tx(dev, skb, sb_dev); ++ ++ q = rcu_dereference_bh(txq->qdisc); ++ ++ trace_net_dev_queue(skb); ++ if (q->enqueue) { ++ rc = __dev_xmit_skb(skb, q, dev, txq); ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ * loopback, all the sorts of tunnels... ++ ++ * Really, it is unlikely that netif_tx_lock protection is necessary ++ * here. (f.e. loopback and IP tunnels are clean ignoring statistics ++ * counters.) ++ * However, it is possible, that they rely on protection ++ * made by us here. ++ ++ * Check this and shot the lock. It is not prone from deadlocks. ++ *Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ /* Other cpus might concurrently change txq->xmit_lock_owner ++ * to -1 or to their cpu id, but not to our id. ++ */ ++ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { ++ if (dev_xmit_recursion()) ++ goto recursion_alert; ++ ++ skb = validate_xmit_skb(skb, dev, &again); ++ if (!skb) ++ goto out; ++ ++ HARD_TX_LOCK(dev, txq, cpu); ++ ++ if (!netif_xmit_stopped(txq)) { ++ dev_xmit_recursion_inc(); ++ skb = dev_hard_start_xmit(skb, dev, txq, &rc); ++ dev_xmit_recursion_dec(); ++ if (dev_xmit_complete(rc)) { ++ HARD_TX_UNLOCK(dev, txq); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ net_crit_ratelimited("Virtual device %s asks to queue packet!\n", ++ dev->name); ++ } else { ++ /* Recursion is detected! It is possible, ++ * unfortunately ++ */ ++recursion_alert: ++ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", ++ dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ rcu_read_unlock_bh(); ++ ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return rc; ++out: ++ rcu_read_unlock_bh(); ++ return rc; ++} ++EXPORT_SYMBOL(__dev_queue_xmit); ++ ++int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) ++{ ++ struct net_device *dev = skb->dev; ++ struct sk_buff *orig_skb = skb; ++ struct netdev_queue *txq; ++ int ret = NETDEV_TX_BUSY; ++ bool again = false; ++ ++ if (unlikely(!netif_running(dev) || ++ !netif_carrier_ok(dev))) ++ goto drop; ++ ++ skb = validate_xmit_skb_list(skb, dev, &again); ++ if (skb != orig_skb) ++ goto drop; ++ ++ skb_set_queue_mapping(skb, queue_id); ++ txq = skb_get_tx_queue(dev, skb); ++ ++ local_bh_disable(); ++ ++ dev_xmit_recursion_inc(); ++ HARD_TX_LOCK(dev, txq, smp_processor_id()); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) ++ ret = netdev_start_xmit(skb, dev, txq, false); ++ HARD_TX_UNLOCK(dev, txq); ++ dev_xmit_recursion_dec(); ++ ++ local_bh_enable(); ++ return ret; ++drop: ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return NET_XMIT_DROP; ++} ++EXPORT_SYMBOL(__dev_direct_xmit); ++ ++/************************************************************************* ++ * Receiver routines ++ *************************************************************************/ ++ ++int netdev_max_backlog __read_mostly = 1000; ++EXPORT_SYMBOL(netdev_max_backlog); ++ ++int netdev_tstamp_prequeue __read_mostly = 1; ++unsigned int sysctl_skb_defer_max __read_mostly = 64; ++int netdev_budget __read_mostly = 300; ++/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ ++unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ++int weight_p __read_mostly = 64; /* old backlog weight */ ++int dev_weight_rx_bias 
__read_mostly = 1; /* bias for backlog weight */ ++int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ ++int dev_rx_weight __read_mostly = 64; ++int dev_tx_weight __read_mostly = 64; ++ ++/* Called with irq disabled */ ++static inline void ____napi_schedule(struct softnet_data *sd, ++ struct napi_struct *napi) ++{ ++ struct task_struct *thread; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) { ++ /* Paired with smp_mb__before_atomic() in ++ * napi_enable()/dev_set_threaded(). ++ * Use READ_ONCE() to guarantee a complete ++ * read on napi->thread. Only call ++ * wake_up_process() when it's not NULL. ++ */ ++ thread = READ_ONCE(napi->thread); ++ if (thread) { ++ /* Avoid doing set_bit() if the thread is in ++ * INTERRUPTIBLE state, cause napi_thread_wait() ++ * makes sure to proceed with napi polling ++ * if the thread is explicitly woken from here. ++ */ ++ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) ++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++ wake_up_process(thread); ++ return; ++ } ++ } ++ ++ list_add_tail(&napi->poll_list, &sd->poll_list); ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++} ++ ++#ifdef CONFIG_RPS ++ ++/* One global table that all flow-based protocols share. */ ++struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; ++EXPORT_SYMBOL(rps_sock_flow_table); ++u32 rps_cpu_mask __read_mostly; ++EXPORT_SYMBOL(rps_cpu_mask); ++ ++struct static_key_false rps_needed __read_mostly; ++EXPORT_SYMBOL(rps_needed); ++struct static_key_false rfs_needed __read_mostly; ++EXPORT_SYMBOL(rfs_needed); ++ ++static struct rps_dev_flow * ++set_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow *rflow, u16 next_cpu) ++{ ++ if (next_cpu < nr_cpu_ids) { ++#ifdef CONFIG_RFS_ACCEL ++ struct netdev_rx_queue *rxqueue; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *old_rflow; ++ u32 flow_id; ++ u16 rxq_index; ++ int rc; ++ ++ /* Should we steer this flow to a different hardware queue? */ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ goto out; ++ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); ++ if (rxq_index == skb_get_rx_queue(skb)) ++ goto out; ++ ++ rxqueue = dev->_rx + rxq_index; ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (!flow_table) ++ goto out; ++ flow_id = skb_get_hash(skb) & flow_table->mask; ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, ++ rxq_index, flow_id); ++ if (rc < 0) ++ goto out; ++ old_rflow = rflow; ++ rflow = &flow_table->flows[flow_id]; ++ rflow->filter = rc; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = RPS_NO_FILTER; ++ out: ++#endif ++ rflow->last_qtail = ++ per_cpu(softnet_data, next_cpu).input_queue_head; ++ } ++ ++ rflow->cpu = next_cpu; ++ return rflow; ++} ++ ++/* ++ * get_rps_cpu is called from netif_receive_skb and returns the target ++ * CPU from the RPS map of the receiving queue for a given skb. ++ * rcu_read_lock must be held on entry. 
++ */ ++static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow **rflowp) ++{ ++ const struct rps_sock_flow_table *sock_flow_table; ++ struct netdev_rx_queue *rxqueue = dev->_rx; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_map *map; ++ int cpu = -1; ++ u32 tcpu; ++ u32 hash; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ goto done; ++ } ++ rxqueue += index; ++ } ++ ++ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ ++ ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ map = rcu_dereference(rxqueue->rps_map); ++ if (!flow_table && !map) ++ goto done; ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ goto done; ++ ++ sock_flow_table = rcu_dereference(rps_sock_flow_table); ++ if (flow_table && sock_flow_table) { ++ struct rps_dev_flow *rflow; ++ u32 next_cpu; ++ u32 ident; ++ ++ /* First check into global flow table if there is a match */ ++ ident = sock_flow_table->ents[hash & sock_flow_table->mask]; ++ if ((ident ^ hash) & ~rps_cpu_mask) ++ goto try_rps; ++ ++ next_cpu = ident & rps_cpu_mask; ++ ++ /* OK, now we know there is a match, ++ * we can look at the local (per receive queue) flow table ++ */ ++ rflow = &flow_table->flows[hash & flow_table->mask]; ++ tcpu = rflow->cpu; ++ ++ /* ++ * If the desired CPU (where last recvmsg was done) is ++ * different from current CPU (one in the rx-queue flow ++ * table entry), switch if one of the following holds: ++ * - Current CPU is unset (>= nr_cpu_ids). ++ * - Current CPU is offline. ++ * - The current CPU's queue tail has advanced beyond the ++ * last packet that was enqueued using this table entry. ++ * This guarantees that all previous packets for the flow ++ * have been dequeued, thus preserving in order delivery. ++ */ ++ if (unlikely(tcpu != next_cpu) && ++ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ++ ((int)(per_cpu(softnet_data, tcpu).input_queue_head - ++ rflow->last_qtail)) >= 0)) { ++ tcpu = next_cpu; ++ rflow = set_rps_cpu(dev, skb, rflow, next_cpu); ++ } ++ ++ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { ++ *rflowp = rflow; ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++try_rps: ++ ++ if (map) { ++ tcpu = map->cpus[reciprocal_scale(hash, map->len)]; ++ if (cpu_online(tcpu)) { ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++done: ++ return cpu; ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++ ++/** ++ * rps_may_expire_flow - check whether an RFS hardware filter may be removed ++ * @dev: Device on which the filter was set ++ * @rxq_index: RX queue index ++ * @flow_id: Flow ID passed to ndo_rx_flow_steer() ++ * @filter_id: Filter ID returned by ndo_rx_flow_steer() ++ * ++ * Drivers that implement ndo_rx_flow_steer() should periodically call ++ * this function for each installed filter and remove the filters for ++ * which it returns %true. 
++ */ ++bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids && ++ ((int)(per_cpu(softnet_data, cpu).input_queue_head - ++ rflow->last_qtail) < ++ (int)(10 * flow_table->mask))) ++ expire = false; ++ } ++ rcu_read_unlock(); ++ return expire; ++} ++EXPORT_SYMBOL(rps_may_expire_flow); ++ ++#endif /* CONFIG_RFS_ACCEL */ ++ ++/* Called from hardirq (IPI) context */ ++static void rps_trigger_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ ____napi_schedule(sd, &sd->backlog); ++ sd->received_rps++; ++} ++ ++#endif /* CONFIG_RPS */ ++ ++/* Called from hardirq (IPI) context */ ++static void trigger_rx_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ smp_store_release(&sd->defer_ipi_scheduled, 0); ++} ++ ++/* ++ * Check if this softnet_data structure is another cpu one ++ * If yes, queue it to our IPI list and return 1 ++ * If no, return 0 ++ */ ++static int napi_schedule_rps(struct softnet_data *sd) ++{ ++ struct softnet_data *mysd = this_cpu_ptr(&softnet_data); ++ ++#ifdef CONFIG_RPS ++ if (sd != mysd) { ++ sd->rps_ipi_next = mysd->rps_ipi_list; ++ mysd->rps_ipi_list = sd; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ return 1; ++ } ++#endif /* CONFIG_RPS */ ++ __napi_schedule_irqoff(&mysd->backlog); ++ return 0; ++} ++ ++#ifdef CONFIG_NET_FLOW_LIMIT ++int netdev_flow_limit_table_len __read_mostly = (1 << 12); ++#endif ++ ++static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) ++{ ++#ifdef CONFIG_NET_FLOW_LIMIT ++ struct sd_flow_limit *fl; ++ struct softnet_data *sd; ++ unsigned int old_flow, new_flow; ++ ++ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) ++ return false; ++ ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rcu_read_lock(); ++ fl = rcu_dereference(sd->flow_limit); ++ if (fl) { ++ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); ++ old_flow = fl->history[fl->history_head]; ++ fl->history[fl->history_head] = new_flow; ++ ++ fl->history_head++; ++ fl->history_head &= FLOW_LIMIT_HISTORY - 1; ++ ++ if (likely(fl->buckets[old_flow])) ++ fl->buckets[old_flow]--; ++ ++ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { ++ fl->count++; ++ rcu_read_unlock(); ++ return true; ++ } ++ } ++ rcu_read_unlock(); ++#endif ++ return false; ++} ++ ++/* ++ * enqueue_to_backlog is called to queue an skb to a per CPU backlog ++ * queue (may be a remote CPU queue). 
++ */ ++static int enqueue_to_backlog(struct sk_buff *skb, int cpu, ++ unsigned int *qtail) ++{ ++ enum skb_drop_reason reason; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int qlen; ++ ++ reason = SKB_DROP_REASON_NOT_SPECIFIED; ++ sd = &per_cpu(softnet_data, cpu); ++ ++ rps_lock_irqsave(sd, &flags); ++ if (!netif_running(skb->dev)) ++ goto drop; ++ qlen = skb_queue_len(&sd->input_pkt_queue); ++ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { ++ if (qlen) { ++enqueue: ++ __skb_queue_tail(&sd->input_pkt_queue, skb); ++ input_queue_tail_incr_save(sd, qtail); ++ rps_unlock_irq_restore(sd, &flags); ++ return NET_RX_SUCCESS; ++ } ++ ++ /* Schedule NAPI for backlog device ++ * We can use non atomic operation since we own the queue lock ++ */ ++ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) ++ napi_schedule_rps(sd); ++ goto enqueue; ++ } ++ reason = SKB_DROP_REASON_CPU_BACKLOG; ++ ++drop: ++ sd->dropped++; ++ rps_unlock_irq_restore(sd, &flags); ++ ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ kfree_skb_reason(skb, reason); ++ return NET_RX_DROP; ++} ++ ++static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_rx_queue *rxqueue; ++ ++ rxqueue = dev->_rx; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ ++ return rxqueue; /* Return first rxqueue */ ++ } ++ rxqueue += index; ++ } ++ return rxqueue; ++} ++ ++u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ void *orig_data, *orig_data_end, *hard_start; ++ struct netdev_rx_queue *rxqueue; ++ bool orig_bcast, orig_host; ++ u32 mac_len, frame_sz; ++ __be16 orig_eth_type; ++ struct ethhdr *eth; ++ u32 metalen, act; ++ int off; ++ ++ /* The XDP program wants to see the packet starting at the MAC ++ * header. 
++ */ ++ mac_len = skb->data - skb_mac_header(skb); ++ hard_start = skb->data - skb_headroom(skb); ++ ++ /* SKB "head" area always have tailroom for skb_shared_info */ ++ frame_sz = (void *)skb_end_pointer(skb) - hard_start; ++ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ rxqueue = netif_get_rxqueue(skb); ++ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); ++ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, ++ skb_headlen(skb) + mac_len, true); ++ ++ orig_data_end = xdp->data_end; ++ orig_data = xdp->data; ++ eth = (struct ethhdr *)xdp->data; ++ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); ++ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); ++ orig_eth_type = eth->h_proto; ++ ++ act = bpf_prog_run_xdp(xdp_prog, xdp); ++ ++ /* check if bpf_xdp_adjust_head was used */ ++ off = xdp->data - orig_data; ++ if (off) { ++ if (off > 0) ++ __skb_pull(skb, off); ++ else if (off < 0) ++ __skb_push(skb, -off); ++ ++ skb->mac_header += off; ++ skb_reset_network_header(skb); ++ } ++ ++ /* check if bpf_xdp_adjust_tail was used */ ++ off = xdp->data_end - orig_data_end; ++ if (off != 0) { ++ skb_set_tail_pointer(skb, xdp->data_end - xdp->data); ++ skb->len += off; /* positive on grow, negative on shrink */ ++ } ++ ++ /* check if XDP changed eth hdr such SKB needs update */ ++ eth = (struct ethhdr *)xdp->data; ++ if ((orig_eth_type != eth->h_proto) || ++ (orig_host != ether_addr_equal_64bits(eth->h_dest, ++ skb->dev->dev_addr)) || ++ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { ++ __skb_push(skb, ETH_HLEN); ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ } ++ ++ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull ++ * before calling us again on redirect path. We do not call do_redirect ++ * as we leave that up to the caller. ++ * ++ * Caller is responsible for managing lifetime of skb (i.e. calling ++ * kfree_skb in response to actions it cannot handle/XDP_DROP). ++ */ ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ __skb_push(skb, mac_len); ++ break; ++ case XDP_PASS: ++ metalen = xdp->data - xdp->data_meta; ++ if (metalen) ++ skb_metadata_set(skb, metalen); ++ break; ++ } ++ ++ return act; ++} ++ ++static u32 netif_receive_generic_xdp(struct sk_buff *skb, ++ struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ u32 act = XDP_DROP; ++ ++ /* Reinjected packets coming from act_mirred or similar should ++ * not get XDP generic processing. ++ */ ++ if (skb_is_redirected(skb)) ++ return XDP_PASS; ++ ++ /* XDP packets must be linear and must have sufficient headroom ++ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also ++ * native XDP provides, thus we need to do it here as well. ++ */ ++ if (skb_cloned(skb) || skb_is_nonlinear(skb) || ++ skb_headroom(skb) < XDP_PACKET_HEADROOM) { ++ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); ++ int troom = skb->tail + skb->data_len - skb->end; ++ ++ /* In case we have to go down the path and also linearize, ++ * then lets do the pskb_expand_head() work just once here. ++ */ ++ if (pskb_expand_head(skb, ++ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, ++ troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) ++ goto do_drop; ++ if (skb_linearize(skb)) ++ goto do_drop; ++ } ++ ++ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ case XDP_PASS: ++ break; ++ default: ++ bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_DROP: ++ do_drop: ++ kfree_skb(skb); ++ break; ++ } ++ ++ return act; ++} ++ ++/* When doing generic XDP we have to bypass the qdisc layer and the ++ * network taps in order to match in-driver-XDP behavior. This also means ++ * that XDP packets are able to starve other packets going through a qdisc, ++ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX ++ * queues, so they do not have this starvation issue. ++ */ ++void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq; ++ bool free_skb = true; ++ int cpu, rc; ++ ++ txq = netdev_core_pick_tx(dev, skb, NULL); ++ cpu = smp_processor_id(); ++ HARD_TX_LOCK(dev, txq, cpu); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) { ++ rc = netdev_start_xmit(skb, dev, txq, 0); ++ if (dev_xmit_complete(rc)) ++ free_skb = false; ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ if (free_skb) { ++ trace_xdp_exception(dev, xdp_prog, XDP_TX); ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb(skb); ++ } ++} ++ ++static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); ++ ++int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) ++{ ++ if (xdp_prog) { ++ struct xdp_buff xdp; ++ u32 act; ++ int err; ++ ++ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); ++ if (act != XDP_PASS) { ++ switch (act) { ++ case XDP_REDIRECT: ++ err = xdp_do_generic_redirect(skb->dev, skb, ++ &xdp, xdp_prog); ++ if (err) ++ goto out_redir; ++ break; ++ case XDP_TX: ++ generic_xdp_tx(skb, xdp_prog); ++ break; ++ } ++ return XDP_DROP; ++ } ++ } ++ return XDP_PASS; ++out_redir: ++ kfree_skb_reason(skb, SKB_DROP_REASON_XDP); ++ return XDP_DROP; ++} ++EXPORT_SYMBOL_GPL(do_xdp_generic); ++ ++static int netif_rx_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_rx(skb); ++ ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu; ++ ++ rcu_read_lock(); ++ ++ cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ if (cpu < 0) ++ cpu = smp_processor_id(); ++ ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ ++ rcu_read_unlock(); ++ } else ++#endif ++ { ++ unsigned int qtail; ++ ++ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); ++ } ++ return ret; ++} ++ ++/** ++ * __netif_rx - Slightly optimized version of netif_rx ++ * @skb: buffer to post ++ * ++ * This behaves as netif_rx except that it does not disable bottom halves. ++ * As a result this function may only be invoked from the interrupt context ++ * (either hard or soft interrupt). 
++ */ ++int __netif_rx(struct sk_buff *skb) ++{ ++ int ret; ++ ++ lockdep_assert_once(hardirq_count() | softirq_count()); ++ ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ return ret; ++} ++EXPORT_SYMBOL(__netif_rx); ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process via the backlog NAPI device. It ++ * always succeeds. The buffer may be dropped during processing for ++ * congestion control or by the protocol layers. ++ * The network buffer is passed via the backlog NAPI device. Modern NIC ++ * driver should use NAPI and GRO. ++ * This function can used from interrupt and from process context. The ++ * caller from process context must not disable interrupts before invoking ++ * this function. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++int netif_rx(struct sk_buff *skb) ++{ ++ bool need_bh_off = !(hardirq_count() | softirq_count()); ++ int ret; ++ ++ if (need_bh_off) ++ local_bh_disable(); ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ if (need_bh_off) ++ local_bh_enable(); ++ return ret; ++} ++EXPORT_SYMBOL(netif_rx); ++ ++static __latent_entropy void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ ++ clist = clist->next; ++ ++ WARN_ON(refcount_read(&skb->users)); ++ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) ++ trace_consume_skb(skb); ++ else ++ trace_kfree_skb(skb, net_tx_action, ++ SKB_DROP_REASON_NOT_SPECIFIED); ++ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) ++ __kfree_skb(skb); ++ else ++ __kfree_skb_defer(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct Qdisc *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ sd->output_queue_tailp = &sd->output_queue; ++ local_irq_enable(); ++ ++ rcu_read_lock(); ++ ++ while (head) { ++ struct Qdisc *q = head; ++ spinlock_t *root_lock = NULL; ++ ++ head = head->next_sched; ++ ++ /* We need to make sure head->next_sched is read ++ * before clearing __QDISC_STATE_SCHED ++ */ ++ smp_mb__before_atomic(); ++ ++ if (!(q->flags & TCQ_F_NOLOCK)) { ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, ++ &q->state))) { ++ /* There is a synchronize_net() between ++ * STATE_DEACTIVATED flag being set and ++ * qdisc_reset()/some_qdisc_is_busy() in ++ * dev_deactivate(), so we can safely bail out ++ * early here to avoid data race between ++ * qdisc_deactivate() and some_qdisc_is_busy() ++ * for lockless qdisc. 
++ */ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ continue; ++ } ++ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ qdisc_run(q); ++ if (root_lock) ++ spin_unlock(root_lock); ++ } ++ ++ rcu_read_unlock(); ++ } ++ ++ xfrm_dev_backlog(sd); ++} ++ ++#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) ++/* This hook is defined here for ATM LANE */ ++int (*br_fdb_test_addr_hook)(struct net_device *dev, ++ unsigned char *addr) __read_mostly; ++EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); ++#endif ++ ++static inline struct sk_buff * ++sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, ++ struct net_device *orig_dev, bool *another) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); ++ struct tcf_result cl_res; ++ ++ /* If there's at least one ingress present somewhere (so ++ * we get here via enabled static key), remaining devices ++ * that are not configured with an ingress qdisc will bail ++ * out here. ++ */ ++ if (!miniq) ++ return skb; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ skb->tc_at_ingress = 1; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); ++ *ret = NET_RX_DROP; ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ consume_skb(skb); ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* skb_mac_header check was done by cls/act_bpf, so ++ * we can safely push the L2 header back before ++ * redirecting to another netdev ++ */ ++ __skb_push(skb, skb->mac_len); ++ if (skb_do_redirect(skb) == -EAGAIN) { ++ __skb_pull(skb, skb->mac_len); ++ *another = true; ++ break; ++ } ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_CONSUMED: ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ return skb; ++} ++ ++/** ++ * netdev_is_rx_handler_busy - check if receive handler is registered ++ * @dev: device to check ++ * ++ * Check if a receive handler is already registered for a given device. ++ * Return true if there one. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++bool netdev_is_rx_handler_busy(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ return dev && rtnl_dereference(dev->rx_handler); ++} ++EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); ++ ++/** ++ * netdev_rx_handler_register - register receive handler ++ * @dev: device to register a handler for ++ * @rx_handler: receive handler to register ++ * @rx_handler_data: data pointer that is used by rx handler ++ * ++ * Register a receive handler for a device. This handler will then be ++ * called from __netif_receive_skb. A negative errno code is returned ++ * on a failure. ++ * ++ * The caller must hold the rtnl_mutex. ++ * ++ * For a general description of rx_handler, see enum rx_handler_result. 
++ */ ++int netdev_rx_handler_register(struct net_device *dev, ++ rx_handler_func_t *rx_handler, ++ void *rx_handler_data) ++{ ++ if (netdev_is_rx_handler_busy(dev)) ++ return -EBUSY; ++ ++ if (dev->priv_flags & IFF_NO_RX_HANDLER) ++ return -EINVAL; ++ ++ /* Note: rx_handler_data must be set before rx_handler */ ++ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); ++ rcu_assign_pointer(dev->rx_handler, rx_handler); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_register); ++ ++/** ++ * netdev_rx_handler_unregister - unregister receive handler ++ * @dev: device to unregister a handler from ++ * ++ * Unregister a receive handler from a device. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++void netdev_rx_handler_unregister(struct net_device *dev) ++{ ++ ++ ASSERT_RTNL(); ++ RCU_INIT_POINTER(dev->rx_handler, NULL); ++ /* a reader seeing a non NULL rx_handler in a rcu_read_lock() ++ * section has a guarantee to see a non NULL rx_handler_data ++ * as well. ++ */ ++ synchronize_net(); ++ RCU_INIT_POINTER(dev->rx_handler_data, NULL); ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); ++ ++/* ++ * Limit the use of PFMEMALLOC reserves to those protocols that implement ++ * the special handling of PFMEMALLOC skbs. ++ */ ++static bool skb_pfmemalloc_protocol(struct sk_buff *skb) ++{ ++ switch (skb->protocol) { ++ case htons(ETH_P_ARP): ++ case htons(ETH_P_IP): ++ case htons(ETH_P_IPV6): ++ case htons(ETH_P_8021Q): ++ case htons(ETH_P_8021AD): ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, ++ int *ret, struct net_device *orig_dev) ++{ ++ if (nf_hook_ingress_active(skb)) { ++ int ingress_retval; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ rcu_read_lock(); ++ ingress_retval = nf_hook_ingress(skb); ++ rcu_read_unlock(); ++ return ingress_retval; ++ } ++ return 0; ++} ++ ++static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, ++ struct packet_type **ppt_prev) ++{ ++ struct packet_type *ptype, *pt_prev; ++ rx_handler_func_t *rx_handler; ++ struct sk_buff *skb = *pskb; ++ struct net_device *orig_dev; ++ bool deliver_exact = false; ++ int ret = NET_RX_DROP; ++ __be16 type; ++ ++ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_receive_skb(skb); ++ ++ orig_dev = skb->dev; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ pt_prev = NULL; ++ ++another_round: ++ skb->skb_iif = skb->dev->ifindex; ++ ++ __this_cpu_inc(softnet_data.processed); ++ ++ if (static_branch_unlikely(&generic_xdp_needed_key)) { ++ int ret2; ++ ++ migrate_disable(); ++ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); ++ migrate_enable(); ++ ++ if (ret2 != XDP_PASS) { ++ ret = NET_RX_DROP; ++ goto out; ++ } ++ } ++ ++ if (eth_type_vlan(skb->protocol)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ } ++ ++ if (skb_skip_tc_classify(skb)) ++ goto skip_classify; ++ ++ if (pfmemalloc) ++ goto skip_taps; ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++skip_taps: ++#ifdef CONFIG_NET_INGRESS ++ if 
(static_branch_unlikely(&ingress_needed_key)) { ++ bool another = false; ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, ++ &another); ++ if (another) ++ goto another_round; ++ if (!skb) ++ goto out; ++ ++ nf_skip_egress(skb, false); ++ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) ++ goto out; ++ } ++#endif ++ skb_reset_redirect(skb); ++skip_classify: ++ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) ++ goto drop; ++ ++ if (skb_vlan_tag_present(skb)) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ if (vlan_do_receive(&skb)) ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ } ++ ++ rx_handler = rcu_dereference(skb->dev->rx_handler); ++ if (rx_handler) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ switch (rx_handler(&skb)) { ++ case RX_HANDLER_CONSUMED: ++ ret = NET_RX_SUCCESS; ++ goto out; ++ case RX_HANDLER_ANOTHER: ++ goto another_round; ++ case RX_HANDLER_EXACT: ++ deliver_exact = true; ++ break; ++ case RX_HANDLER_PASS: ++ break; ++ default: ++ BUG(); ++ } ++ } ++ ++ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { ++check_vlan_id: ++ if (skb_vlan_tag_get_id(skb)) { ++ /* Vlan id is non 0 and vlan_do_receive() above couldn't ++ * find vlan device. ++ */ ++ skb->pkt_type = PACKET_OTHERHOST; ++ } else if (eth_type_vlan(skb->protocol)) { ++ /* Outer header is 802.1P with vlan 0, inner header is ++ * 802.1Q or 802.1AD and vlan_do_receive() above could ++ * not find vlan dev for vlan id 0. ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ if (vlan_do_receive(&skb)) ++ /* After stripping off 802.1P header with vlan 0 ++ * vlan dev is found for inner header. ++ */ ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ else ++ /* We have stripped outer 802.1P vlan 0 header. ++ * But could not find vlan dev. ++ * check again for vlan id to set OTHERHOST. ++ */ ++ goto check_vlan_id; ++ } ++ /* Note: we might in the future use prio bits ++ * and set skb->priority like in vlan_do_receive() ++ * For the time being, just ignore Priority Code Point ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ type = skb->protocol; ++ ++ /* deliver only exact match when indicated */ ++ if (likely(!deliver_exact)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &ptype_base[ntohs(type) & ++ PTYPE_HASH_MASK]); ++ } ++ ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &orig_dev->ptype_specific); ++ ++ if (unlikely(skb->dev != orig_dev)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &skb->dev->ptype_specific); ++ } ++ ++ if (pt_prev) { ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ goto drop; ++ *ppt_prev = pt_prev; ++ } else { ++drop: ++ if (!deliver_exact) ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ else ++ dev_core_stats_rx_nohandler_inc(skb->dev); ++ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ /* The invariant here is that if *ppt_prev is not NULL ++ * then skb should also be non-NULL. ++ * ++ * Apparently *ppt_prev assignment above holds this invariant due to ++ * skb dereferencing near it. 
++ */ ++ *pskb = skb; ++ return ret; ++} ++ ++static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) ++{ ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ int ret; ++ ++ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (pt_prev) ++ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, ++ skb->dev, pt_prev, orig_dev); ++ return ret; ++} ++ ++/** ++ * netif_receive_skb_core - special purpose version of netif_receive_skb ++ * @skb: buffer to process ++ * ++ * More direct receive version of netif_receive_skb(). It should ++ * only be used by callers that have a need to skip RPS and Generic XDP. ++ * Caller must also take care of handling if ``(page_is_)pfmemalloc``. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb_core(struct sk_buff *skb) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = __netif_receive_skb_one_core(skb, false); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb_core); ++ ++static inline void __netif_receive_skb_list_ptype(struct list_head *head, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ struct sk_buff *skb, *next; ++ ++ if (!pt_prev) ++ return; ++ if (list_empty(head)) ++ return; ++ if (pt_prev->list_func != NULL) ++ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ++ ip_list_rcv, head, pt_prev, orig_dev); ++ else ++ list_for_each_entry_safe(skb, next, head, list) { ++ skb_list_del_init(skb); ++ pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++ } ++} ++ ++static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) ++{ ++ /* Fast-path assumptions: ++ * - There is no RX handler. ++ * - Only one packet_type matches. ++ * If either of these fails, we will end up doing some per-packet ++ * processing in-line, then handling the 'last ptype' for the whole ++ * sublist. This can't cause out-of-order delivery to any single ptype, ++ * because the 'last ptype' must be constant across the sublist, and all ++ * other ptypes are handled per-packet. 
++ */ ++ /* Current (common) ptype of sublist */ ++ struct packet_type *pt_curr = NULL; ++ /* Current (common) orig_dev of sublist */ ++ struct net_device *od_curr = NULL; ++ struct list_head sublist; ++ struct sk_buff *skb, *next; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ ++ skb_list_del_init(skb); ++ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (!pt_prev) ++ continue; ++ if (pt_curr != pt_prev || od_curr != orig_dev) { ++ /* dispatch old sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++ /* start new sublist */ ++ INIT_LIST_HEAD(&sublist); ++ pt_curr = pt_prev; ++ od_curr = orig_dev; ++ } ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* dispatch final sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++} ++ ++static int __netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { ++ unsigned int noreclaim_flag; ++ ++ /* ++ * PFMEMALLOC skbs are special, they should ++ * - be delivered to SOCK_MEMALLOC sockets only ++ * - stay away from userspace ++ * - have bounded memory usage ++ * ++ * Use PF_MEMALLOC as this saves us from propagating the allocation ++ * context down to all allocation sites. ++ */ ++ noreclaim_flag = memalloc_noreclaim_save(); ++ ret = __netif_receive_skb_one_core(skb, true); ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } else ++ ret = __netif_receive_skb_one_core(skb, false); ++ ++ return ret; ++} ++ ++static void __netif_receive_skb_list(struct list_head *head) ++{ ++ unsigned long noreclaim_flag = 0; ++ struct sk_buff *skb, *next; ++ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ ++ ++ list_for_each_entry_safe(skb, next, head, list) { ++ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { ++ struct list_head sublist; ++ ++ /* Handle the previous sublist */ ++ list_cut_before(&sublist, head, &skb->list); ++ if (!list_empty(&sublist)) ++ __netif_receive_skb_list_core(&sublist, pfmemalloc); ++ pfmemalloc = !pfmemalloc; ++ /* See comments in __netif_receive_skb */ ++ if (pfmemalloc) ++ noreclaim_flag = memalloc_noreclaim_save(); ++ else ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } ++ } ++ /* Handle the remaining sublist */ ++ if (!list_empty(head)) ++ __netif_receive_skb_list_core(head, pfmemalloc); ++ /* Restore pflags */ ++ if (pfmemalloc) ++ memalloc_noreclaim_restore(noreclaim_flag); ++} ++ ++static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); ++ struct bpf_prog *new = xdp->prog; ++ int ret = 0; ++ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ rcu_assign_pointer(dev->xdp_prog, new); ++ if (old) ++ bpf_prog_put(old); ++ ++ if (old && !new) { ++ static_branch_dec(&generic_xdp_needed_key); ++ } else if (new && !old) { ++ static_branch_inc(&generic_xdp_needed_key); ++ dev_disable_lro(dev); ++ dev_disable_gro_hw(dev); ++ } ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int netif_receive_skb_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ if (skb_defer_rx_timestamp(skb)) ++ return NET_RX_SUCCESS; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ rcu_read_unlock(); ++ return ret; ++ } ++ } ++#endif ++ ret = __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ return ret; ++} ++ ++void netif_receive_skb_list_internal(struct list_head *head) ++{ ++ struct sk_buff *skb, *next; ++ struct list_head sublist; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ skb_list_del_init(skb); ++ if (!skb_defer_rx_timestamp(skb)) ++ list_add_tail(&skb->list, &sublist); ++ } ++ list_splice_init(&sublist, head); ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ /* Will be handled, remove from list */ ++ skb_list_del_init(skb); ++ enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ } ++ } ++ } ++#endif ++ __netif_receive_skb_list(head); ++ rcu_read_unlock(); ++} ++ ++/** ++ * netif_receive_skb - process receive buffer from network ++ * @skb: buffer to process ++ * ++ * netif_receive_skb() is the main receive data processing function. ++ * It always succeeds. The buffer may be dropped during processing ++ * for congestion control or by the protocol layers. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. 
++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ trace_netif_receive_skb_entry(skb); ++ ++ ret = netif_receive_skb_internal(skb); ++ trace_netif_receive_skb_exit(ret); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb); ++ ++/** ++ * netif_receive_skb_list - process many receive buffers from network ++ * @head: list of skbs to process. ++ * ++ * Since return value of netif_receive_skb() is normally ignored, and ++ * wouldn't be meaningful for a list, this function returns void. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ */ ++void netif_receive_skb_list(struct list_head *head) ++{ ++ struct sk_buff *skb; ++ ++ if (list_empty(head)) ++ return; ++ if (trace_netif_receive_skb_list_entry_enabled()) { ++ list_for_each_entry(skb, head, list) ++ trace_netif_receive_skb_list_entry(skb); ++ } ++ netif_receive_skb_list_internal(head); ++ trace_netif_receive_skb_list_exit(0); ++} ++EXPORT_SYMBOL(netif_receive_skb_list); ++ ++static DEFINE_PER_CPU(struct work_struct, flush_works); ++ ++/* Network device is going away, flush any packets still pending */ ++static void flush_backlog(struct work_struct *work) ++{ ++ struct sk_buff *skb, *tmp; ++ struct softnet_data *sd; ++ ++ local_bh_disable(); ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rps_lock_irq_disable(sd); ++ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->input_pkt_queue); ++ dev_kfree_skb_irq(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ rps_unlock_irq_enable(sd); ++ ++ skb_queue_walk_safe(&sd->process_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->process_queue); ++ kfree_skb(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ local_bh_enable(); ++} ++ ++static bool flush_required(int cpu) ++{ ++#if IS_ENABLED(CONFIG_RPS) ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ bool do_flush; ++ ++ rps_lock_irq_disable(sd); ++ ++ /* as insertion into process_queue happens with the rps lock held, ++ * process_queue access may race only with dequeue ++ */ ++ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || ++ !skb_queue_empty_lockless(&sd->process_queue); ++ rps_unlock_irq_enable(sd); ++ ++ return do_flush; ++#endif ++ /* without RPS we can't safely check input_pkt_queue: during a ++ * concurrent remote skb_queue_splice() we can detect as empty both ++ * input_pkt_queue and process_queue even if the latter could end-up ++ * containing a lot of packets. 
++ */ ++ return true; ++} ++ ++static void flush_all_backlogs(void) ++{ ++ static cpumask_t flush_cpus; ++ unsigned int cpu; ++ ++ /* since we are under rtnl lock protection we can use static data ++ * for the cpumask and avoid allocating on stack the possibly ++ * large mask ++ */ ++ ASSERT_RTNL(); ++ ++ cpus_read_lock(); ++ ++ cpumask_clear(&flush_cpus); ++ for_each_online_cpu(cpu) { ++ if (flush_required(cpu)) { ++ queue_work_on(cpu, system_highpri_wq, ++ per_cpu_ptr(&flush_works, cpu)); ++ cpumask_set_cpu(cpu, &flush_cpus); ++ } ++ } ++ ++ /* we can have in flight packet[s] on the cpus we are not flushing, ++ * synchronize_net() in unregister_netdevice_many() will take care of ++ * them ++ */ ++ for_each_cpu(cpu, &flush_cpus) ++ flush_work(per_cpu_ptr(&flush_works, cpu)); ++ ++ cpus_read_unlock(); ++} ++ ++static void net_rps_send_ipi(struct softnet_data *remsd) ++{ ++#ifdef CONFIG_RPS ++ while (remsd) { ++ struct softnet_data *next = remsd->rps_ipi_next; ++ ++ if (cpu_online(remsd->cpu)) ++ smp_call_function_single_async(remsd->cpu, &remsd->csd); ++ remsd = next; ++ } ++#endif ++} ++ ++/* ++ * net_rps_action_and_irq_enable sends any pending IPI's for rps. ++ * Note: called with local irq disabled, but exits with local irq enabled. ++ */ ++static void net_rps_action_and_irq_enable(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ struct softnet_data *remsd = sd->rps_ipi_list; ++ ++ if (remsd) { ++ sd->rps_ipi_list = NULL; ++ ++ local_irq_enable(); ++ ++ /* Send pending IPI's to kick RPS processing on remote cpus. */ ++ net_rps_send_ipi(remsd); ++ } else ++#endif ++ local_irq_enable(); ++} ++ ++static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ return sd->rps_ipi_list != NULL; ++#else ++ return false; ++#endif ++} ++ ++static int process_backlog(struct napi_struct *napi, int quota) ++{ ++ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); ++ bool again = true; ++ int work = 0; ++ ++ /* Check if we have pending ipi, its better to send them now, ++ * not waiting net_rx_action() end. ++ */ ++ if (sd_has_rps_ipi_waiting(sd)) { ++ local_irq_disable(); ++ net_rps_action_and_irq_enable(sd); ++ } ++ ++ napi->weight = READ_ONCE(dev_rx_weight); ++ while (again) { ++ struct sk_buff *skb; ++ ++ while ((skb = __skb_dequeue(&sd->process_queue))) { ++ rcu_read_lock(); ++ __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ input_queue_head_incr(sd); ++ if (++work >= quota) ++ return work; ++ ++ } ++ ++ rps_lock_irq_disable(sd); ++ if (skb_queue_empty(&sd->input_pkt_queue)) { ++ /* ++ * Inline a custom version of __napi_complete(). ++ * only current cpu owns and manipulates this napi, ++ * and NAPI_STATE_SCHED is the only possible flag set ++ * on backlog. ++ * We can use a plain write instead of clear_bit(), ++ * and we dont need an smp_mb() memory barrier. ++ */ ++ napi->state = 0; ++ again = false; ++ } else { ++ skb_queue_splice_tail_init(&sd->input_pkt_queue, ++ &sd->process_queue); ++ } ++ rps_unlock_irq_enable(sd); ++ } ++ ++ return work; ++} ++ ++/** ++ * __napi_schedule - schedule for receive ++ * @n: entry to schedule ++ * ++ * The entry's receive function will be scheduled to run. ++ * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
++ */ ++void __napi_schedule(struct napi_struct *n) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__napi_schedule); ++ ++/** ++ * napi_schedule_prep - check if napi can be scheduled ++ * @n: napi context ++ * ++ * Test if NAPI routine is already running, and if not mark ++ * it as running. This is used as a condition variable to ++ * insure only one NAPI poll instance runs. We also make ++ * sure there is no pending NAPI disable. ++ */ ++bool napi_schedule_prep(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ if (unlikely(val & NAPIF_STATE_DISABLE)) ++ return false; ++ new = val | NAPIF_STATE_SCHED; ++ ++ /* Sets STATE_MISSED bit if STATE_SCHED was already set ++ * This was suggested by Alexander Duyck, as compiler ++ * emits better code than : ++ * if (val & NAPIF_STATE_SCHED) ++ * new |= NAPIF_STATE_MISSED; ++ */ ++ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * ++ NAPIF_STATE_MISSED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ return !(val & NAPIF_STATE_SCHED); ++} ++EXPORT_SYMBOL(napi_schedule_prep); ++ ++/** ++ * __napi_schedule_irqoff - schedule for receive ++ * @n: entry to schedule ++ * ++ * Variant of __napi_schedule() assuming hard irqs are masked. ++ * ++ * On PREEMPT_RT enabled kernels this maps to __napi_schedule() ++ * because the interrupt disabled assumption might not be true ++ * due to force-threaded interrupts and spinlock substitution. ++ */ ++void __napi_schedule_irqoff(struct napi_struct *n) ++{ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ else ++ __napi_schedule(n); ++} ++EXPORT_SYMBOL(__napi_schedule_irqoff); ++ ++bool napi_complete_done(struct napi_struct *n, int work_done) ++{ ++ unsigned long flags, val, new, timeout = 0; ++ bool ret = true; ++ ++ /* ++ * 1) Don't let napi dequeue from the cpu poll list ++ * just in case its running on a different cpu. ++ * 2) If we are busy polling, do nothing here, we have ++ * the guarantee we will be called later. ++ */ ++ if (unlikely(n->state & (NAPIF_STATE_NPSVC | ++ NAPIF_STATE_IN_BUSY_POLL))) ++ return false; ++ ++ if (work_done) { ++ if (n->gro_bitmask) ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); ++ } ++ if (n->defer_hard_irqs_count > 0) { ++ n->defer_hard_irqs_count--; ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ if (timeout) ++ ret = false; ++ } ++ if (n->gro_bitmask) { ++ /* When the NAPI instance uses a timeout and keeps postponing ++ * it, we need to bound somehow the time packets are kept in ++ * the GRO layer ++ */ ++ napi_gro_flush(n, !!timeout); ++ } ++ ++ gro_normal_list(n); ++ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ /* If n->poll_list is not empty, we need to mask irqs */ ++ local_irq_save(flags); ++ list_del_init(&n->poll_list); ++ local_irq_restore(flags); ++ } ++ ++ do { ++ val = READ_ONCE(n->state); ++ ++ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); ++ ++ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | ++ NAPIF_STATE_SCHED_THREADED | ++ NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ /* If STATE_MISSED was set, leave STATE_SCHED set, ++ * because we will call napi->poll() one more time. ++ * This C code was suggested by Alexander Duyck to help gcc. 
++ */ ++ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * ++ NAPIF_STATE_SCHED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ if (unlikely(val & NAPIF_STATE_MISSED)) { ++ __napi_schedule(n); ++ return false; ++ } ++ ++ if (timeout) ++ hrtimer_start(&n->timer, ns_to_ktime(timeout), ++ HRTIMER_MODE_REL_PINNED); ++ return ret; ++} ++EXPORT_SYMBOL(napi_complete_done); ++ ++/* must be called under rcu_read_lock(), as we dont take a reference */ ++static struct napi_struct *napi_by_id(unsigned int napi_id) ++{ ++ unsigned int hash = napi_id % HASH_SIZE(napi_hash); ++ struct napi_struct *napi; ++ ++ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) ++ if (napi->napi_id == napi_id) ++ return napi; ++ ++ return NULL; ++} ++ ++#if defined(CONFIG_NET_RX_BUSY_POLL) ++ ++static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) ++{ ++ if (!skip_schedule) { ++ gro_normal_list(napi); ++ __napi_schedule(napi); ++ return; ++ } ++ ++ if (napi->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(napi, HZ >= 1000); ++ } ++ ++ gro_normal_list(napi); ++ clear_bit(NAPI_STATE_SCHED, &napi->state); ++} ++ ++static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, ++ u16 budget) ++{ ++ bool skip_schedule = false; ++ unsigned long timeout; ++ int rc; ++ ++ /* Busy polling means there is a high chance device driver hard irq ++ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was ++ * set in napi_schedule_prep(). ++ * Since we are about to call napi->poll() once more, we can safely ++ * clear NAPI_STATE_MISSED. ++ * ++ * Note: x86 could use a single "lock and ..." instruction ++ * to perform these two clear_bit() ++ */ ++ clear_bit(NAPI_STATE_MISSED, &napi->state); ++ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); ++ ++ local_bh_disable(); ++ ++ if (prefer_busy_poll) { ++ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); ++ timeout = READ_ONCE(napi->dev->gro_flush_timeout); ++ if (napi->defer_hard_irqs_count && timeout) { ++ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); ++ skip_schedule = true; ++ } ++ } ++ ++ /* All we really want here is to re-enable device interrupts. ++ * Ideally, a new ndo_busy_poll_stop() could avoid another round. ++ */ ++ rc = napi->poll(napi, budget); ++ /* We can't gro_normal_list() here, because napi->poll() might have ++ * rearmed the napi (napi_complete_done()) in which case it could ++ * already be running on another CPU. ++ */ ++ trace_napi_poll(napi, rc, budget); ++ netpoll_poll_unlock(have_poll_lock); ++ if (rc == budget) ++ __busy_poll_stop(napi, skip_schedule); ++ local_bh_enable(); ++} ++ ++void napi_busy_loop(unsigned int napi_id, ++ bool (*loop_end)(void *, unsigned long), ++ void *loop_end_arg, bool prefer_busy_poll, u16 budget) ++{ ++ unsigned long start_time = loop_end ? busy_loop_current_time() : 0; ++ int (*napi_poll)(struct napi_struct *napi, int budget); ++ void *have_poll_lock = NULL; ++ struct napi_struct *napi; ++ ++restart: ++ napi_poll = NULL; ++ ++ rcu_read_lock(); ++ ++ napi = napi_by_id(napi_id); ++ if (!napi) ++ goto out; ++ ++ preempt_disable(); ++ for (;;) { ++ int work = 0; ++ ++ local_bh_disable(); ++ if (!napi_poll) { ++ unsigned long val = READ_ONCE(napi->state); ++ ++ /* If multiple threads are competing for this napi, ++ * we avoid dirtying napi->state as much as we can. 
++ */ ++ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | ++ NAPIF_STATE_IN_BUSY_POLL)) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ if (cmpxchg(&napi->state, val, ++ val | NAPIF_STATE_IN_BUSY_POLL | ++ NAPIF_STATE_SCHED) != val) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ have_poll_lock = netpoll_poll_lock(napi); ++ napi_poll = napi->poll; ++ } ++ work = napi_poll(napi, budget); ++ trace_napi_poll(napi, work, budget); ++ gro_normal_list(napi); ++count: ++ if (work > 0) ++ __NET_ADD_STATS(dev_net(napi->dev), ++ LINUX_MIB_BUSYPOLLRXPACKETS, work); ++ local_bh_enable(); ++ ++ if (!loop_end || loop_end(loop_end_arg, start_time)) ++ break; ++ ++ if (unlikely(need_resched())) { ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++ rcu_read_unlock(); ++ cond_resched(); ++ if (loop_end(loop_end_arg, start_time)) ++ return; ++ goto restart; ++ } ++ cpu_relax(); ++ } ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++out: ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(napi_busy_loop); ++ ++#endif /* CONFIG_NET_RX_BUSY_POLL */ ++ ++static void napi_hash_add(struct napi_struct *napi) ++{ ++ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) ++ return; ++ ++ spin_lock(&napi_hash_lock); ++ ++ /* 0..NR_CPUS range is reserved for sender_cpu use */ ++ do { ++ if (unlikely(++napi_gen_id < MIN_NAPI_ID)) ++ napi_gen_id = MIN_NAPI_ID; ++ } while (napi_by_id(napi_gen_id)); ++ napi->napi_id = napi_gen_id; ++ ++ hlist_add_head_rcu(&napi->napi_hash_node, ++ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++/* Warning : caller is responsible to make sure rcu grace period ++ * is respected before freeing memory containing @napi ++ */ ++static void napi_hash_del(struct napi_struct *napi) ++{ ++ spin_lock(&napi_hash_lock); ++ ++ hlist_del_init_rcu(&napi->napi_hash_node); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ++{ ++ struct napi_struct *napi; ++ ++ napi = container_of(timer, struct napi_struct, timer); ++ ++ /* Note : we use a relaxed variant of napi_schedule_prep() not setting ++ * NAPI_STATE_MISSED, since we do not react to a device IRQ. ++ */ ++ if (!napi_disable_pending(napi) && ++ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { ++ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ __napi_schedule_irqoff(napi); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void init_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ INIT_LIST_HEAD(&napi->gro_hash[i].list); ++ napi->gro_hash[i].count = 0; ++ } ++ napi->gro_bitmask = 0; ++} ++ ++int dev_set_threaded(struct net_device *dev, bool threaded) ++{ ++ struct napi_struct *napi; ++ int err = 0; ++ ++ if (dev->threaded == threaded) ++ return 0; ++ ++ if (threaded) { ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (!napi->thread) { ++ err = napi_kthread_create(napi); ++ if (err) { ++ threaded = false; ++ break; ++ } ++ } ++ } ++ } ++ ++ dev->threaded = threaded; ++ ++ /* Make sure kthread is created before THREADED bit ++ * is set. ++ */ ++ smp_mb__before_atomic(); ++ ++ /* Setting/unsetting threaded mode on a napi might not immediately ++ * take effect, if the current napi instance is actively being ++ * polled. 
In this case, the switch between threaded mode and ++ * softirq mode will happen in the next round of napi_schedule(). ++ * This should not cause hiccups/stalls to the live traffic. ++ */ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (threaded) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_set_threaded); ++ ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++static void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ ++void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int weight) ++{ ++ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) ++ return; ++ ++ INIT_LIST_HEAD(&napi->poll_list); ++ INIT_HLIST_NODE(&napi->napi_hash_node); ++ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); ++ napi->timer.function = napi_watchdog; ++ init_gro_hash(napi); ++ napi->skb = NULL; ++ INIT_LIST_HEAD(&napi->rx_list); ++ napi->rx_count = 0; ++ napi->poll = poll; ++ if (weight > NAPI_POLL_WEIGHT) ++ netdev_err_once(dev, "%s() called with weight %d\n", __func__, ++ weight); ++ napi->weight = weight; ++ napi->dev = dev; ++#ifdef CONFIG_NETPOLL ++ napi->poll_owner = -1; ++#endif ++ set_bit(NAPI_STATE_SCHED, &napi->state); ++ set_bit(NAPI_STATE_NPSVC, &napi->state); ++ list_add_rcu(&napi->dev_list, &dev->napi_list); ++ napi_hash_add(napi); ++ napi_get_frags_check(napi); ++ /* Create kthread for this napi if dev->threaded is set. ++ * Clear dev->threaded if kthread creation failed so that ++ * threaded mode will not be enabled in napi_enable(). ++ */ ++ if (dev->threaded && napi_kthread_create(napi)) ++ dev->threaded = 0; ++} ++EXPORT_SYMBOL(netif_napi_add_weight); ++ ++void napi_disable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ might_sleep(); ++ set_bit(NAPI_STATE_DISABLE, &n->state); ++ ++ for ( ; ; ) { ++ val = READ_ONCE(n->state); ++ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { ++ usleep_range(20, 200); ++ continue; ++ } ++ ++ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; ++ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ if (cmpxchg(&n->state, val, new) == val) ++ break; ++ } ++ ++ hrtimer_cancel(&n->timer); ++ ++ clear_bit(NAPI_STATE_DISABLE, &n->state); ++} ++EXPORT_SYMBOL(napi_disable); ++ ++/** ++ * napi_enable - enable NAPI scheduling ++ * @n: NAPI context ++ * ++ * Resume NAPI from being scheduled on this context. ++ * Must be paired with napi_disable. 
++ */ ++void napi_enable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); ++ ++ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); ++ if (n->dev->threaded && n->thread) ++ new |= NAPIF_STATE_THREADED; ++ } while (cmpxchg(&n->state, val, new) != val); ++} ++EXPORT_SYMBOL(napi_enable); ++ ++static void flush_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ struct sk_buff *skb, *n; ++ ++ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) ++ kfree_skb(skb); ++ napi->gro_hash[i].count = 0; ++ } ++} ++ ++/* Must be called in process context */ ++void __netif_napi_del(struct napi_struct *napi) ++{ ++ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) ++ return; ++ ++ napi_hash_del(napi); ++ list_del_rcu(&napi->dev_list); ++ napi_free_frags(napi); ++ ++ flush_gro_hash(napi); ++ napi->gro_bitmask = 0; ++ ++ if (napi->thread) { ++ kthread_stop(napi->thread); ++ napi->thread = NULL; ++ } ++} ++EXPORT_SYMBOL(__netif_napi_del); ++ ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ if (unlikely(work > weight)) ++ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", ++ n->poll, work, weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ /* The NAPI context has more processing work, but busy-polling ++ * is preferred. Exit early. ++ */ ++ if (napi_prefer_busy_poll(n)) { ++ if (napi_complete_done(n, work)) { ++ /* If timeout is not set, we need to make sure ++ * that the NAPI is re-scheduled. ++ */ ++ napi_schedule(n); ++ } ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ gro_normal_list(n); ++ ++ /* Some drivers may have called napi_schedule ++ * prior to exhausting their budget. ++ */ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ pr_warn_once("%s: Budget exhausted after napi rescheduled\n", ++ n->dev ? 
n->dev->name : "backlog"); ++ return work; ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static int napi_poll(struct napi_struct *n, struct list_head *repoll) ++{ ++ bool do_repoll = false; ++ void *have; ++ int work; ++ ++ list_del_init(&n->poll_list); ++ ++ have = netpoll_poll_lock(n); ++ ++ work = __napi_poll(n, &do_repoll); ++ ++ if (do_repoll) ++ list_add_tail(&n->poll_list, repoll); ++ ++ netpoll_poll_unlock(have); ++ ++ return work; ++} ++ ++static int napi_thread_wait(struct napi_struct *napi) ++{ ++ bool woken = false; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ /* Testing SCHED_THREADED bit here to make sure the current ++ * kthread owns this napi and could poll on this napi. ++ * Testing SCHED bit is not enough because SCHED bit might be ++ * set by some other busy poll thread or by napi_disable(). ++ */ ++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { ++ WARN_ON(!list_empty(&napi->poll_list)); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ schedule(); ++ /* woken being true indicates this thread owns this napi. */ ++ woken = true; ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ return -1; ++} ++ ++static int napi_threaded_poll(void *data) ++{ ++ struct napi_struct *napi = data; ++ void *have; ++ ++ while (!napi_thread_wait(napi)) { ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(napi); ++ __napi_poll(napi, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ break; ++ ++ cond_resched(); ++ } ++ } ++ return 0; ++} ++ ++static void skb_defer_free_flush(struct softnet_data *sd) ++{ ++ struct sk_buff *skb, *next; ++ unsigned long flags; ++ ++ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ ++ if (!READ_ONCE(sd->defer_list)) ++ return; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ skb = sd->defer_list; ++ sd->defer_list = NULL; ++ sd->defer_count = 0; ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ while (skb != NULL) { ++ next = skb->next; ++ napi_consume_skb(skb, 1); ++ skb = next; ++ } ++} ++ ++static __latent_entropy void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ unsigned long time_limit = jiffies + ++ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); ++ int budget = READ_ONCE(netdev_budget); ++ LIST_HEAD(list); ++ LIST_HEAD(repoll); ++ ++ local_irq_disable(); ++ list_splice_init(&sd->poll_list, &list); ++ local_irq_enable(); ++ ++ for (;;) { ++ struct napi_struct *n; ++ ++ skb_defer_free_flush(sd); ++ ++ if (list_empty(&list)) { ++ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) ++ goto end; ++ break; ++ } ++ ++ n = list_first_entry(&list, struct napi_struct, poll_list); ++ budget -= napi_poll(n, &repoll); ++ ++ /* If softirq window is exhausted then punt. ++ * Allow this to run for 2 jiffies since which will allow ++ * an average latency of 1.5/HZ. 
++ */ ++ if (unlikely(budget <= 0 || ++ time_after_eq(jiffies, time_limit))) { ++ sd->time_squeeze++; ++ break; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ list_splice_tail_init(&sd->poll_list, &list); ++ list_splice_tail(&repoll, &list); ++ list_splice(&list, &sd->poll_list); ++ if (!list_empty(&sd->poll_list)) ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ ++ net_rps_action_and_irq_enable(sd); ++end:; ++} ++ ++struct netdev_adjacent { ++ struct net_device *dev; ++ netdevice_tracker dev_tracker; ++ ++ /* upper master flag, there can only be one master device per list */ ++ bool master; ++ ++ /* lookup ignore flag */ ++ bool ignore; ++ ++ /* counter for the number of times this device was added to us */ ++ u16 ref_nr; ++ ++ /* private field for the users */ ++ void *private; ++ ++ struct list_head list; ++ struct rcu_head rcu; ++}; ++ ++static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, ++ struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ list_for_each_entry(adj, adj_list, list) { ++ if (adj->dev == adj_dev) ++ return adj; ++ } ++ return NULL; ++} ++ ++static int ____netdev_has_upper_dev(struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *dev = (struct net_device *)priv->data; ++ ++ return upper_dev == dev; ++} ++ ++/** ++ * netdev_has_upper_dev - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks only immediate upper device, ++ * not through a complete stack of devices. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev); ++ ++/** ++ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks the entire upper device chain. ++ * The caller must hold rcu lock. ++ */ ++ ++bool netdev_has_upper_dev_all_rcu(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); ++ ++/** ++ * netdev_has_any_upper_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to an upper device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_any_upper_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.upper); ++} ++EXPORT_SYMBOL(netdev_has_any_upper_dev); ++ ++/** ++ * netdev_master_upper_dev_get - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RTNL lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get); ++ ++static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master) && !upper->ignore) ++ return upper->dev; ++ return NULL; ++} ++ ++/** ++ * netdev_has_any_lower_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to a lower device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++static bool netdev_has_any_lower_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.lower); ++} ++ ++void *netdev_adjacent_get_private(struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = list_entry(adj_list, struct netdev_adjacent, list); ++ ++ return adj->private; ++} ++EXPORT_SYMBOL(netdev_adjacent_get_private); ++ ++/** ++ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next device from the dev's upper list, starting from iter ++ * position. The caller must hold RCU read lock. ++ */ ++struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); ++ ++static struct net_device *__netdev_next_upper_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ *ignore = upper->ignore; ++ ++ return upper->dev; ++} ++ ++static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++ ++static int __netdev_walk_all_upper_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = __netdev_next_upper_dev(now, &iter, &ignore); ++ if (!udev) ++ break; ++ if 
(ignore) ++ continue; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++int netdev_walk_all_upper_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = netdev_next_upper_dev_rcu(now, &iter); ++ if (!udev) ++ break; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); ++ ++static bool __netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++ ++/** ++ * netdev_lower_get_next_private - Get the next ->private from the ++ * lower neighbour list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold either hold the ++ * RTNL lock or its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. ++ */ ++void *netdev_lower_get_next_private(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private); ++ ++/** ++ * netdev_lower_get_next_private_rcu - Get the next ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_next_private_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); ++ ++/** ++ * netdev_lower_get_next - Get the next device from the lower neighbour ++ * list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RTNL lock or ++ * its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. 
++ */ ++void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_lower_get_next); ++ ++static struct net_device *netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++ ++static struct net_device *__netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ *ignore = lower->ignore; ++ ++ return lower->dev; ++} ++ ++int netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); ++ ++static int __netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = __netdev_next_lower_dev(now, &iter, &ignore); ++ if (!ldev) ++ break; ++ if (ignore) ++ continue; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_next_lower_dev_rcu); ++ ++static u8 __netdev_upper_depth(struct net_device *dev) ++{ ++ struct net_device *udev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.upper, ++ udev = __netdev_next_upper_dev(dev, &iter, &ignore); ++ udev; ++ 
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < udev->upper_level) ++ max_depth = udev->upper_level; ++ } ++ ++ return max_depth; ++} ++ ++static u8 __netdev_lower_depth(struct net_device *dev) ++{ ++ struct net_device *ldev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.lower, ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ++ ldev; ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < ldev->lower_level) ++ max_depth = ldev->lower_level; ++ } ++ ++ return max_depth; ++} ++ ++static int __netdev_update_upper_level(struct net_device *dev, ++ struct netdev_nested_priv *__unused) ++{ ++ dev->upper_level = __netdev_upper_depth(dev) + 1; ++ return 0; ++} ++ ++#ifdef CONFIG_LOCKDEP ++static LIST_HEAD(net_unlink_list); ++ ++static void net_unlink_todo(struct net_device *dev) ++{ ++ if (list_empty(&dev->unlink_list)) ++ list_add_tail(&dev->unlink_list, &net_unlink_list); ++} ++#endif ++ ++static int __netdev_update_lower_level(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ dev->lower_level = __netdev_lower_depth(dev) + 1; ++ ++#ifdef CONFIG_LOCKDEP ++ if (!priv) ++ return 0; ++ ++ if (priv->flags & NESTED_SYNC_IMM) ++ dev->nested_level = dev->lower_level - 1; ++ if (priv->flags & NESTED_SYNC_TODO) ++ net_unlink_todo(dev); ++#endif ++ return 0; ++} ++ ++int netdev_walk_all_lower_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev_rcu(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); ++ ++/** ++ * netdev_lower_get_first_private_rcu - Get the first ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * ++ * Gets the first netdev_adjacent->private from the dev's lower neighbour ++ * list. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_first_private_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_first_or_null_rcu(&dev->adj_list.lower, ++ struct netdev_adjacent, list); ++ if (lower) ++ return lower->private; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); ++ ++/** ++ * netdev_master_upper_dev_get_rcu - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RCU read lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_first_or_null_rcu(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (upper && likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); ++ ++static int netdev_adjacent_sysfs_add(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", adj_dev->name); ++ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), ++ linkname); ++} ++static void netdev_adjacent_sysfs_del(struct net_device *dev, ++ char *name, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", name); ++ sysfs_remove_link(&(dev->dev.kobj), linkname); ++} ++ ++static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ return (dev_list == &dev->adj_list.upper || ++ dev_list == &dev->adj_list.lower) && ++ net_eq(dev_net(dev), dev_net(adj_dev)); ++} ++ ++static int __netdev_adjacent_dev_insert(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list, ++ void *private, bool master) ++{ ++ struct netdev_adjacent *adj; ++ int ret; ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (adj) { ++ adj->ref_nr += 1; ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", ++ dev->name, adj_dev->name, adj->ref_nr); ++ ++ return 0; ++ } ++ ++ adj = kmalloc(sizeof(*adj), GFP_KERNEL); ++ if (!adj) ++ return -ENOMEM; ++ ++ adj->dev = adj_dev; ++ adj->master = master; ++ adj->ref_nr = 1; ++ adj->private = private; ++ adj->ignore = false; ++ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); ++ ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", ++ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ++ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); ++ if (ret) ++ goto free_adj; ++ } ++ ++ /* Ensure that master link is always the first item in list. 
*/ ++ if (master) { ++ ret = sysfs_create_link(&(dev->dev.kobj), ++ &(adj_dev->dev.kobj), "master"); ++ if (ret) ++ goto remove_symlinks; ++ ++ list_add_rcu(&adj->list, dev_list); ++ } else { ++ list_add_tail_rcu(&adj->list, dev_list); ++ } ++ ++ return 0; ++ ++remove_symlinks: ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++free_adj: ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree(adj); ++ ++ return ret; ++} ++ ++static void __netdev_adjacent_dev_remove(struct net_device *dev, ++ struct net_device *adj_dev, ++ u16 ref_nr, ++ struct list_head *dev_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", ++ dev->name, adj_dev->name, ref_nr); ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (!adj) { ++ pr_err("Adjacency does not exist for device %s from %s\n", ++ dev->name, adj_dev->name); ++ WARN_ON(1); ++ return; ++ } ++ ++ if (adj->ref_nr > ref_nr) { ++ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", ++ dev->name, adj_dev->name, ref_nr, ++ adj->ref_nr - ref_nr); ++ adj->ref_nr -= ref_nr; ++ return; ++ } ++ ++ if (adj->master) ++ sysfs_remove_link(&(dev->dev.kobj), "master"); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++ ++ list_del_rcu(&adj->list); ++ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", ++ adj_dev->name, dev->name, adj_dev->name); ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree_rcu(adj, rcu); ++} ++ ++static int __netdev_adjacent_dev_link_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct list_head *up_list, ++ struct list_head *down_list, ++ void *private, bool master) ++{ ++ int ret; ++ ++ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, ++ private, master); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, ++ private, false); ++ if (ret) { ++ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ u16 ref_nr, ++ struct list_head *up_list, ++ struct list_head *down_list) ++{ ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); ++ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); ++} ++ ++static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *private, bool master) ++{ ++ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower, ++ private, master); ++} ++ ++static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower); ++} ++ ++static int __netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, bool master, ++ void *upper_priv, void *upper_info, ++ struct netdev_nested_priv *priv, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ .extack = extack, ++ }, ++ .upper_dev = upper_dev, ++ .master = master, ++ .linking = true, ++ .upper_info = upper_info, ++ }; ++ struct net_device *master_dev; ++ int ret = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev == upper_dev) ++ return -EBUSY; ++ ++ /* To prevent 
loops, check if dev is not upper device to upper_dev. */ ++ if (__netdev_has_upper_dev(upper_dev, dev)) ++ return -EBUSY; ++ ++ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) ++ return -EMLINK; ++ ++ if (!master) { ++ if (__netdev_has_upper_dev(dev, upper_dev)) ++ return -EEXIST; ++ } else { ++ master_dev = __netdev_master_upper_dev_get(dev); ++ if (master_dev) ++ return master_dev == upper_dev ? -EEXIST : -EBUSY; ++ } ++ ++ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, ++ master); ++ if (ret) ++ return ret; ++ ++ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto rollback; ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++ ++ return 0; ++ ++rollback: ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ return ret; ++} ++ ++/** ++ * netdev_upper_dev_link - Add a link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. On a failure a negative errno code is returned. ++ * On success the reference counts are adjusted and the function ++ * returns zero. ++ */ ++int netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, false, ++ NULL, NULL, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_upper_dev_link); ++ ++/** ++ * netdev_master_upper_dev_link - Add a master link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @upper_priv: upper device private ++ * @upper_info: upper info to be passed down via notifier ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. In this case, only ++ * one master upper device can be linked, although other non-master devices ++ * might be linked as well. The caller must hold the RTNL lock. ++ * On a failure a negative errno code is returned. On success the reference ++ * counts are adjusted and the function returns zero. 
++ */ ++int netdev_master_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *upper_priv, void *upper_info, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, true, ++ upper_priv, upper_info, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_link); ++ ++static void __netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .upper_dev = upper_dev, ++ .linking = false, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; ++ ++ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++} ++ ++/** ++ * netdev_upper_dev_unlink - Removes a link to upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * ++ * Removes a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. ++ */ ++void netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ __netdev_upper_dev_unlink(dev, upper_dev, &priv); ++} ++EXPORT_SYMBOL(netdev_upper_dev_unlink); ++ ++static void __netdev_adjacent_dev_set(struct net_device *upper_dev, ++ struct net_device *lower_dev, ++ bool val) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); ++ if (adj) ++ adj->ignore = val; ++ ++ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); ++ if (adj) ++ adj->ignore = val; ++} ++ ++static void netdev_adjacent_dev_disable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, true); ++} ++ ++static void netdev_adjacent_dev_enable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, false); ++} ++ ++int netdev_adjacent_change_prepare(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ int err; ++ ++ if (!new_dev) ++ return 0; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_disable(dev, old_dev); ++ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, ++ extack); ++ if (err) { ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_adjacent_change_prepare); ++ ++void netdev_adjacent_change_commit(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ if (!new_dev || !old_dev) ++ return; ++ ++ if (new_dev == old_dev) ++ return; ++ ++ 
netdev_adjacent_dev_enable(dev, old_dev); ++ __netdev_upper_dev_unlink(old_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_commit); ++ ++void netdev_adjacent_change_abort(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ ++ if (!new_dev) ++ return; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ ++ __netdev_upper_dev_unlink(new_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_abort); ++ ++/** ++ * netdev_bonding_info_change - Dispatch event about slave change ++ * @dev: device ++ * @bonding_info: info to dispatch ++ * ++ * Send NETDEV_BONDING_INFO to netdev notifiers with info. ++ * The caller must hold the RTNL lock. ++ */ ++void netdev_bonding_info_change(struct net_device *dev, ++ struct netdev_bonding_info *bonding_info) ++{ ++ struct netdev_notifier_bonding_info info = { ++ .info.dev = dev, ++ }; ++ ++ memcpy(&info.bonding_info, bonding_info, ++ sizeof(struct netdev_bonding_info)); ++ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, ++ &info.info); ++} ++EXPORT_SYMBOL(netdev_bonding_info_change); ++ ++static int netdev_offload_xstats_enable_l3(struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ int err; ++ int rc; ++ ++ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), ++ GFP_KERNEL); ++ if (!dev->offload_xstats_l3) ++ return -ENOMEM; ++ ++ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, ++ NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ err = notifier_to_errno(rc); ++ if (err) ++ goto free_stats; ++ ++ return 0; ++ ++free_stats: ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++ return err; ++} ++ ++int netdev_offload_xstats_enable(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return netdev_offload_xstats_enable_l3(dev, extack); ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enable); ++ ++static void netdev_offload_xstats_disable_l3(struct net_device *dev) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++} ++ ++int netdev_offload_xstats_disable(struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ if (!netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ netdev_offload_xstats_disable_l3(dev); ++ return 0; ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_disable); ++ ++static void netdev_offload_xstats_disable_all(struct net_device *dev) ++{ ++ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); ++} ++ ++static struct rtnl_hw_stats64 * ++netdev_offload_xstats_get_ptr(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return dev->offload_xstats_l3; ++ } ++ ++ 
WARN_ON(1); ++ return NULL; ++} ++ ++bool netdev_offload_xstats_enabled(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ return netdev_offload_xstats_get_ptr(dev, type); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enabled); ++ ++struct netdev_notifier_offload_xstats_ru { ++ bool used; ++}; ++ ++struct netdev_notifier_offload_xstats_rd { ++ struct rtnl_hw_stats64 stats; ++ bool used; ++}; ++ ++static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, ++ const struct rtnl_hw_stats64 *src) ++{ ++ dest->rx_packets += src->rx_packets; ++ dest->tx_packets += src->tx_packets; ++ dest->rx_bytes += src->rx_bytes; ++ dest->tx_bytes += src->tx_bytes; ++ dest->rx_errors += src->rx_errors; ++ dest->tx_errors += src->tx_errors; ++ dest->rx_dropped += src->rx_dropped; ++ dest->tx_dropped += src->tx_dropped; ++ dest->multicast += src->multicast; ++} ++ ++static int netdev_offload_xstats_get_used(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_ru report_used = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_used = &report_used, ++ }; ++ int rc; ++ ++ WARN_ON(!netdev_offload_xstats_enabled(dev, type)); ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, ++ &info.info); ++ *p_used = report_used.used; ++ return notifier_to_errno(rc); ++} ++ ++static int netdev_offload_xstats_get_stats(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_rd report_delta = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_delta = &report_delta, ++ }; ++ struct rtnl_hw_stats64 *stats; ++ int rc; ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return -EINVAL; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, ++ &info.info); ++ ++ /* Cache whatever we got, even if there was an error, otherwise the ++ * successful stats retrievals would get lost. 
++ */ ++ netdev_hw_stats64_add(stats, &report_delta.stats); ++ ++ if (p_stats) ++ *p_stats = *stats; ++ *p_used = report_delta.used; ++ ++ return notifier_to_errno(rc); ++} ++ ++int netdev_offload_xstats_get(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (p_stats) ++ return netdev_offload_xstats_get_stats(dev, type, p_stats, ++ p_used, extack); ++ else ++ return netdev_offload_xstats_get_used(dev, type, p_used, ++ extack); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_get); ++ ++void ++netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, ++ const struct rtnl_hw_stats64 *stats) ++{ ++ report_delta->used = true; ++ netdev_hw_stats64_add(&report_delta->stats, stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_delta); ++ ++void ++netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) ++{ ++ report_used->used = true; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_used); ++ ++void netdev_offload_xstats_push_delta(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ const struct rtnl_hw_stats64 *p_stats) ++{ ++ struct rtnl_hw_stats64 *stats; ++ ++ ASSERT_RTNL(); ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return; ++ ++ netdev_hw_stats64_add(stats, p_stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_push_delta); ++ ++/** ++ * netdev_get_xmit_slave - Get the xmit slave of master device ++ * @dev: device ++ * @skb: The packet ++ * @all_slaves: assume all the slaves are active ++ * ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ * %NULL is returned if no slave is found. ++ */ ++ ++struct net_device *netdev_get_xmit_slave(struct net_device *dev, ++ struct sk_buff *skb, ++ bool all_slaves) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_xmit_slave) ++ return NULL; ++ return ops->ndo_get_xmit_slave(dev, skb, all_slaves); ++} ++EXPORT_SYMBOL(netdev_get_xmit_slave); ++ ++static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_sk_get_lower_dev) ++ return NULL; ++ return ops->ndo_sk_get_lower_dev(dev, sk); ++} ++ ++/** ++ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket ++ * @dev: device ++ * @sk: the socket ++ * ++ * %NULL is returned if no lower device is found. 
++ */ ++ ++struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ struct net_device *lower; ++ ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ while (lower) { ++ dev = lower; ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ } ++ ++ return dev; ++} ++EXPORT_SYMBOL(netdev_sk_get_lowest_dev); ++ ++static void netdev_adjacent_add_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.lower); ++ } ++} ++ ++static void netdev_adjacent_del_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.lower); ++ } ++} ++ ++void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ } ++} ++ ++void *netdev_lower_dev_get_private(struct net_device *dev, ++ struct net_device *lower_dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ if (!lower_dev) ++ return NULL; ++ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); ++ if (!lower) ++ return NULL; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_dev_get_private); ++ ++ ++/** ++ * netdev_lower_state_changed - Dispatch event about lower device state change ++ * @lower_dev: device ++ * @lower_state_info: state to dispatch ++ * ++ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. ++ * The caller must hold the RTNL lock. 
++ */ ++void netdev_lower_state_changed(struct net_device *lower_dev, ++ void *lower_state_info) ++{ ++ struct netdev_notifier_changelowerstate_info changelowerstate_info = { ++ .info.dev = lower_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ changelowerstate_info.lower_state_info = lower_state_info; ++ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, ++ &changelowerstate_info.info); ++} ++EXPORT_SYMBOL(netdev_lower_state_changed); ++ ++static void dev_change_rx_flags(struct net_device *dev, int flags) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_rx_flags) ++ ops->ndo_change_rx_flags(dev, flags); ++} ++ ++static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags; ++ kuid_t uid; ++ kgid_t gid; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_PROMISC; ++ dev->promiscuity += inc; ++ if (dev->promiscuity == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch promisc and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_PROMISC; ++ else { ++ dev->promiscuity -= inc; ++ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags != old_flags) { ++ pr_info("device %s %s promiscuous mode\n", ++ dev->name, ++ dev->flags & IFF_PROMISC ? "entered" : "left"); ++ if (audit_enabled) { ++ current_uid_gid(&uid, &gid); ++ audit_log(audit_context(), GFP_ATOMIC, ++ AUDIT_ANOM_PROMISCUOUS, ++ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", ++ dev->name, (dev->flags & IFF_PROMISC), ++ (old_flags & IFF_PROMISC), ++ from_kuid(&init_user_ns, audit_get_loginuid(current)), ++ from_kuid(&init_user_ns, uid), ++ from_kgid(&init_user_ns, gid), ++ audit_get_sessionid(current)); ++ } ++ ++ dev_change_rx_flags(dev, IFF_PROMISC); ++ } ++ if (notify) ++ __dev_notify_flags(dev, old_flags, IFF_PROMISC); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++int dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned int old_flags = dev->flags; ++ int err; ++ ++ err = __dev_set_promiscuity(dev, inc, true); ++ if (err < 0) ++ return err; ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_promiscuity); ++ ++static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_ALLMULTI; ++ dev->allmulti += inc; ++ if (dev->allmulti == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch allmulti and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ else { ++ dev->allmulti -= inc; ++ netdev_warn(dev, "allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags ^ old_flags) { ++ dev_change_rx_flags(dev, IFF_ALLMULTI); ++ dev_set_rx_mode(dev); ++ if (notify) ++ __dev_notify_flags(dev, old_flags, ++ dev->gflags ^ old_gflags); ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++ ++int dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ return __dev_set_allmulti(dev, inc, true); ++} ++EXPORT_SYMBOL(dev_set_allmulti); ++ ++/* ++ * Upload unicast and multicast address lists to device and ++ * configure RX filtering. When the device doesn't support unicast ++ * filtering it is put in promiscuous mode while unicast addresses ++ * are present. ++ */ ++void __dev_set_rx_mode(struct net_device *dev) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* dev_open will call this function so the list will stay sane. */ ++ if (!(dev->flags&IFF_UP)) ++ return; ++ ++ if (!netif_device_present(dev)) ++ return; ++ ++ if (!(dev->priv_flags & IFF_UNICAST_FLT)) { ++ /* Unicast addresses changes may only happen under the rtnl, ++ * therefore calling __dev_set_promiscuity here is safe. ++ */ ++ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { ++ __dev_set_promiscuity(dev, 1, false); ++ dev->uc_promisc = true; ++ } else if (netdev_uc_empty(dev) && dev->uc_promisc) { ++ __dev_set_promiscuity(dev, -1, false); ++ dev->uc_promisc = false; ++ } ++ } ++ ++ if (ops->ndo_set_rx_mode) ++ ops->ndo_set_rx_mode(dev); ++} ++ ++void dev_set_rx_mode(struct net_device *dev) ++{ ++ netif_addr_lock_bh(dev); ++ __dev_set_rx_mode(dev); ++ netif_addr_unlock_bh(dev); ++} ++ ++/** ++ * dev_get_flags - get flags reported to userspace ++ * @dev: device ++ * ++ * Get the combination of flag bits exported through APIs to userspace. ++ */ ++unsigned int dev_get_flags(const struct net_device *dev) ++{ ++ unsigned int flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING | ++ IFF_LOWER_UP | ++ IFF_DORMANT)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev)) { ++ if (netif_oper_up(dev)) ++ flags |= IFF_RUNNING; ++ if (netif_carrier_ok(dev)) ++ flags |= IFF_LOWER_UP; ++ if (netif_dormant(dev)) ++ flags |= IFF_DORMANT; ++ } ++ ++ return flags; ++} ++EXPORT_SYMBOL(dev_get_flags); ++ ++int __dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ unsigned int old_flags = dev->flags; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ if ((old_flags ^ flags) & IFF_MULTICAST) ++ dev_change_rx_flags(dev, IFF_MULTICAST); ++ ++ dev_set_rx_mode(dev); ++ ++ /* ++ * Have we downed the interface. 
We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { ++ if (old_flags & IFF_UP) ++ __dev_close(dev); ++ else ++ ret = __dev_open(dev, extack); ++ } ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? 1 : -1; ++ unsigned int old_flags = dev->flags; ++ ++ dev->gflags ^= IFF_PROMISC; ++ ++ if (__dev_set_promiscuity(dev, inc, false) >= 0) ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ * is important. Some (broken) drivers set IFF_PROMISC, when ++ * IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? 1 : -1; ++ ++ dev->gflags ^= IFF_ALLMULTI; ++ __dev_set_allmulti(dev, inc, false); ++ } ++ ++ return ret; ++} ++ ++void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, ++ unsigned int gchanges) ++{ ++ unsigned int changes = dev->flags ^ old_flags; ++ ++ if (gchanges) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); ++ ++ if (changes & IFF_UP) { ++ if (dev->flags & IFF_UP) ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ else ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { ++ struct netdev_notifier_change_info change_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .flags_changed = changes, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); ++ } ++} ++ ++/** ++ * dev_change_flags - change device settings ++ * @dev: device ++ * @flags: device state flags ++ * @extack: netlink extended ack ++ * ++ * Change settings on device based state flags. The flags are ++ * in the userspace exported format. ++ */ ++int dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ret = __dev_change_flags(dev, flags, extack); ++ if (ret < 0) ++ return ret; ++ ++ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); ++ __dev_notify_flags(dev, old_flags, changes); ++ return ret; ++} ++EXPORT_SYMBOL(dev_change_flags); ++ ++int __dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_mtu) ++ return ops->ndo_change_mtu(dev, new_mtu); ++ ++ /* Pairs with all the lockless reads of dev->mtu in the stack */ ++ WRITE_ONCE(dev->mtu, new_mtu); ++ return 0; ++} ++EXPORT_SYMBOL(__dev_set_mtu); ++ ++int dev_validate_mtu(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ /* MTU must be positive, and in range */ ++ if (new_mtu < 0 || new_mtu < dev->min_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu less than device minimum"); ++ return -EINVAL; ++ } ++ ++ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_mtu_ext - Change maximum transfer unit ++ * @dev: device ++ * @new_mtu: new transfer unit ++ * @extack: netlink extended ack ++ * ++ * Change the maximum transfer size of the network device. 
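Usage note for the MTU helpers above, as a hedged sketch only: changing an interface's MTU from kernel code under RTNL. The interface-name lookup and the use of init_net are assumptions for the example; in-tree callers normally already hold RTNL and have the net_device pointer at hand.

    #include <linux/netdevice.h>
    #include <linux/rtnetlink.h>
    #include <net/net_namespace.h>

    /* Illustrative only: look up a device by name and change its MTU.
     * dev_set_mtu() must be called with RTNL held.
     */
    static int example_set_mtu(const char *ifname, int new_mtu)
    {
            struct net_device *dev;
            int err = -ENODEV;

            rtnl_lock();
            dev = __dev_get_by_name(&init_net, ifname);
            if (dev)
                    err = dev_set_mtu(dev, new_mtu);
            rtnl_unlock();
            return err;
    }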
++ */ ++int dev_set_mtu_ext(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ int err, orig_mtu; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ err = dev_validate_mtu(dev, new_mtu, extack); ++ if (err) ++ return err; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ orig_mtu = dev->mtu; ++ err = __dev_set_mtu(dev, new_mtu); ++ ++ if (!err) { ++ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ orig_mtu); ++ err = notifier_to_errno(err); ++ if (err) { ++ /* setting mtu back and notifying everyone again, ++ * so that they have a chance to revert changes. ++ */ ++ __dev_set_mtu(dev, orig_mtu); ++ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ new_mtu); ++ } ++ } ++ return err; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct netlink_ext_ack extack; ++ int err; ++ ++ memset(&extack, 0, sizeof(extack)); ++ err = dev_set_mtu_ext(dev, new_mtu, &extack); ++ if (err && extack._msg) ++ net_err_ratelimited("%s: %s\n", dev->name, extack._msg); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_mtu); ++ ++/** ++ * dev_change_tx_queue_len - Change TX queue length of a netdevice ++ * @dev: device ++ * @new_len: new tx queue length ++ */ ++int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) ++{ ++ unsigned int orig_len = dev->tx_queue_len; ++ int res; ++ ++ if (new_len != (unsigned int)new_len) ++ return -ERANGE; ++ ++ if (new_len != orig_len) { ++ dev->tx_queue_len = new_len; ++ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); ++ res = notifier_to_errno(res); ++ if (res) ++ goto err_rollback; ++ res = dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; ++ } ++ ++ return 0; ++ ++err_rollback: ++ netdev_err(dev, "refused to change device tx_queue_len\n"); ++ dev->tx_queue_len = orig_len; ++ return res; ++} ++ ++/** ++ * dev_set_group - Change group this device belongs to ++ * @dev: device ++ * @new_group: group this device should belong to ++ */ ++void dev_set_group(struct net_device *dev, int new_group) ++{ ++ dev->group = new_group; ++} ++ ++/** ++ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. 
++ * @dev: device ++ * @addr: new address ++ * @extack: netlink extended ack ++ */ ++int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_pre_changeaddr_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .dev_addr = addr, ++ }; ++ int rc; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); ++ return notifier_to_errno(rc); ++} ++EXPORT_SYMBOL(dev_pre_changeaddr_notify); ++ ++/** ++ * dev_set_mac_address - Change Media Access Control Address ++ * @dev: device ++ * @sa: new address ++ * @extack: netlink extended ack ++ * ++ * Change the hardware (MAC) address of the device ++ */ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int err; ++ ++ if (!ops->ndo_set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); ++ if (err) ++ return err; ++ err = ops->ndo_set_mac_address(dev, sa); ++ if (err) ++ return err; ++ dev->addr_assign_type = NET_ADDR_SET; ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ return 0; ++} ++EXPORT_SYMBOL(dev_set_mac_address); ++ ++static DECLARE_RWSEM(dev_addr_sem); ++ ++int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ down_write(&dev_addr_sem); ++ ret = dev_set_mac_address(dev, sa, extack); ++ up_write(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_set_mac_address_user); ++ ++int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) ++{ ++ size_t size = sizeof(sa->sa_data); ++ struct net_device *dev; ++ int ret = 0; ++ ++ down_read(&dev_addr_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_name_rcu(net, dev_name); ++ if (!dev) { ++ ret = -ENODEV; ++ goto unlock; ++ } ++ if (!dev->addr_len) ++ memset(sa->sa_data, 0, size); ++ else ++ memcpy(sa->sa_data, dev->dev_addr, ++ min_t(size_t, size, dev->addr_len)); ++ sa->sa_family = dev->type; ++ ++unlock: ++ rcu_read_unlock(); ++ up_read(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_get_mac_address); ++ ++/** ++ * dev_change_carrier - Change device carrier ++ * @dev: device ++ * @new_carrier: new value ++ * ++ * Change device carrier ++ */ ++int dev_change_carrier(struct net_device *dev, bool new_carrier) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_change_carrier) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return ops->ndo_change_carrier(dev, new_carrier); ++} ++ ++/** ++ * dev_get_phys_port_id - Get device physical port ID ++ * @dev: device ++ * @ppid: port ID ++ * ++ * Get device physical port ID ++ */ ++int dev_get_phys_port_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_phys_port_id) ++ return -EOPNOTSUPP; ++ return ops->ndo_get_phys_port_id(dev, ppid); ++} ++ ++/** ++ * dev_get_phys_port_name - Get device physical port name ++ * @dev: device ++ * @name: port name ++ * @len: limit of bytes to copy to name ++ * ++ * Get device physical port name ++ */ ++int dev_get_phys_port_name(struct net_device *dev, ++ char *name, size_t len) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ 
int err; ++ ++ if (ops->ndo_get_phys_port_name) { ++ err = ops->ndo_get_phys_port_name(dev, name, len); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ return devlink_compat_phys_port_name_get(dev, name, len); ++} ++ ++/** ++ * dev_get_port_parent_id - Get the device's port parent identifier ++ * @dev: network device ++ * @ppid: pointer to a storage for the port's parent identifier ++ * @recurse: allow/disallow recursion to lower devices ++ * ++ * Get the devices's port parent identifier ++ */ ++int dev_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid, ++ bool recurse) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ struct netdev_phys_item_id first = { }; ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ int err; ++ ++ if (ops->ndo_get_port_parent_id) { ++ err = ops->ndo_get_port_parent_id(dev, ppid); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ err = devlink_compat_switch_id_get(dev, ppid); ++ if (!recurse || err != -EOPNOTSUPP) ++ return err; ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) { ++ err = dev_get_port_parent_id(lower_dev, ppid, true); ++ if (err) ++ break; ++ if (!first.id_len) ++ first = *ppid; ++ else if (memcmp(&first, ppid, sizeof(*ppid))) ++ return -EOPNOTSUPP; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_get_port_parent_id); ++ ++/** ++ * netdev_port_same_parent_id - Indicate if two network devices have ++ * the same port parent identifier ++ * @a: first network device ++ * @b: second network device ++ */ ++bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) ++{ ++ struct netdev_phys_item_id a_id = { }; ++ struct netdev_phys_item_id b_id = { }; ++ ++ if (dev_get_port_parent_id(a, &a_id, true) || ++ dev_get_port_parent_id(b, &b_id, true)) ++ return false; ++ ++ return netdev_phys_item_id_same(&a_id, &b_id); ++} ++EXPORT_SYMBOL(netdev_port_same_parent_id); ++ ++/** ++ * dev_change_proto_down - set carrier according to proto_down. ++ * ++ * @dev: device ++ * @proto_down: new value ++ */ ++int dev_change_proto_down(struct net_device *dev, bool proto_down) ++{ ++ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ if (proto_down) ++ netif_carrier_off(dev); ++ else ++ netif_carrier_on(dev); ++ dev->proto_down = proto_down; ++ return 0; ++} ++ ++/** ++ * dev_change_proto_down_reason - proto down reason ++ * ++ * @dev: device ++ * @mask: proto down mask ++ * @value: proto down value ++ */ ++void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, ++ u32 value) ++{ ++ int b; ++ ++ if (!mask) { ++ dev->proto_down_reason = value; ++ } else { ++ for_each_set_bit(b, &mask, 32) { ++ if (value & (1 << b)) ++ dev->proto_down_reason |= BIT(b); ++ else ++ dev->proto_down_reason &= ~BIT(b); ++ } ++ } ++} ++ ++struct bpf_xdp_link { ++ struct bpf_link link; ++ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ ++ int flags; ++}; ++ ++static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) ++{ ++ if (flags & XDP_FLAGS_HW_MODE) ++ return XDP_MODE_HW; ++ if (flags & XDP_FLAGS_DRV_MODE) ++ return XDP_MODE_DRV; ++ if (flags & XDP_FLAGS_SKB_MODE) ++ return XDP_MODE_SKB; ++ return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++} ++ ++static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ switch (mode) { ++ case XDP_MODE_SKB: ++ return generic_xdp_install; ++ case XDP_MODE_DRV: ++ case XDP_MODE_HW: ++ return dev->netdev_ops->ndo_bpf; ++ default: ++ return NULL; ++ } ++} ++ ++static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ return dev->xdp_state[mode].link; ++} ++ ++static struct bpf_prog *dev_xdp_prog(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ struct bpf_xdp_link *link = dev_xdp_link(dev, mode); ++ ++ if (link) ++ return link->link.prog; ++ return dev->xdp_state[mode].prog; ++} ++ ++u8 dev_xdp_prog_count(struct net_device *dev) ++{ ++ u8 count = 0; ++ int i; ++ ++ for (i = 0; i < __MAX_XDP_MODE; i++) ++ if (dev->xdp_state[i].prog || dev->xdp_state[i].link) ++ count++; ++ return count; ++} ++EXPORT_SYMBOL_GPL(dev_xdp_prog_count); ++ ++u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ struct bpf_prog *prog = dev_xdp_prog(dev, mode); ++ ++ return prog ? prog->aux->id : 0; ++} ++ ++static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_xdp_link *link) ++{ ++ dev->xdp_state[mode].link = link; ++ dev->xdp_state[mode].prog = NULL; ++} ++ ++static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_prog *prog) ++{ ++ dev->xdp_state[mode].link = NULL; ++ dev->xdp_state[mode].prog = prog; ++} ++ ++static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, ++ bpf_op_t bpf_op, struct netlink_ext_ack *extack, ++ u32 flags, struct bpf_prog *prog) ++{ ++ struct netdev_bpf xdp; ++ int err; ++ ++ memset(&xdp, 0, sizeof(xdp)); ++ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; ++ xdp.extack = extack; ++ xdp.flags = flags; ++ xdp.prog = prog; ++ ++ /* Drivers assume refcnt is already incremented (i.e, prog pointer is ++ * "moved" into driver), so they don't increment it on their own, but ++ * they do decrement refcnt when program is detached or replaced. ++ * Given net_device also owns link/prog, we need to bump refcnt here ++ * to prevent drivers from underflowing it. 
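To make the refcount convention in the comment above concrete, here is a hedged driver-side sketch of an ndo_bpf handler: the core has already taken a reference on the incoming program, so the driver only swaps pointers and drops the reference it held on the old one. struct example_priv and the handler are hypothetical; real drivers also reconfigure their RX path here.

    #include <linux/netdevice.h>
    #include <linux/bpf.h>

    struct example_priv {
            struct bpf_prog *xdp_prog;      /* reference owned by the driver */
    };

    static int example_ndo_bpf(struct net_device *dev, struct netdev_bpf *xdp)
    {
            struct example_priv *priv = netdev_priv(dev);
            struct bpf_prog *old;

            switch (xdp->command) {
            case XDP_SETUP_PROG:
                    /* xdp->prog arrives with its refcount already bumped */
                    old = xchg(&priv->xdp_prog, xdp->prog);
                    if (old)
                            bpf_prog_put(old);
                    return 0;
            default:
                    return -EINVAL;
            }
    }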
++ */ ++ if (prog) ++ bpf_prog_inc(prog); ++ err = bpf_op(dev, &xdp); ++ if (err) { ++ if (prog) ++ bpf_prog_put(prog); ++ return err; ++ } ++ ++ if (mode != XDP_MODE_HW) ++ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); ++ ++ return 0; ++} ++ ++static void dev_xdp_uninstall(struct net_device *dev) ++{ ++ struct bpf_xdp_link *link; ++ struct bpf_prog *prog; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { ++ prog = dev_xdp_prog(dev, mode); ++ if (!prog) ++ continue; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) ++ continue; ++ ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ ++ /* auto-detach link from net device */ ++ link = dev_xdp_link(dev, mode); ++ if (link) ++ link->dev = NULL; ++ else ++ bpf_prog_put(prog); ++ ++ dev_xdp_set_link(dev, mode, NULL); ++ } ++} ++ ++static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog, u32 flags) ++{ ++ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); ++ struct bpf_prog *cur_prog; ++ struct net_device *upper; ++ struct list_head *iter; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ /* either link or prog attachment, never both */ ++ if (link && (new_prog || old_prog)) ++ return -EINVAL; ++ /* link supports only XDP mode flags */ ++ if (link && (flags & ~XDP_FLAGS_MODES)) { ++ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); ++ return -EINVAL; ++ } ++ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ ++ if (num_modes > 1) { ++ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); ++ return -EINVAL; ++ } ++ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ ++ if (!num_modes && dev_xdp_prog_count(dev) > 1) { ++ NL_SET_ERR_MSG(extack, ++ "More than one program loaded, unset mode is ambiguous"); ++ return -EINVAL; ++ } ++ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ ++ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { ++ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); ++ return -EINVAL; ++ } ++ ++ mode = dev_xdp_mode(dev, flags); ++ /* can't replace attached link */ ++ if (dev_xdp_link(dev, mode)) { ++ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); ++ return -EBUSY; ++ } ++ ++ /* don't allow if an upper device already has a program */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) { ++ if (dev_xdp_prog_count(upper) > 0) { ++ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); ++ return -EEXIST; ++ } ++ } ++ ++ cur_prog = dev_xdp_prog(dev, mode); ++ /* can't replace attached prog with link */ ++ if (link && cur_prog) { ++ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); ++ return -EBUSY; ++ } ++ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { ++ NL_SET_ERR_MSG(extack, "Active program does not match expected"); ++ return -EEXIST; ++ } ++ ++ /* put effective new program into new_prog */ ++ if (link) ++ new_prog = link->link.prog; ++ ++ if (new_prog) { ++ bool offload = mode == XDP_MODE_HW; ++ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ++ ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++ ++ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { ++ NL_SET_ERR_MSG(extack, "XDP program already attached"); ++ return -EBUSY; ++ } ++ if (!offload && dev_xdp_prog(dev, other_mode)) { ++ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); ++ return -EEXIST; ++ } ++ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { ++ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ } ++ ++ /* don't call drivers if the effective program didn't change */ ++ if (new_prog != cur_prog) { ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) { ++ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); ++ return -EOPNOTSUPP; ++ } ++ ++ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); ++ if (err) ++ return err; ++ } ++ ++ if (link) ++ dev_xdp_set_link(dev, mode, link); ++ else ++ dev_xdp_set_prog(dev, mode, new_prog); ++ if (cur_prog) ++ bpf_prog_put(cur_prog); ++ ++ return 0; ++} ++ ++static int dev_xdp_attach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); ++} ++ ++static int dev_xdp_detach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ mode = dev_xdp_mode(dev, link->flags); ++ if (dev_xdp_link(dev, mode) != link) ++ return -EINVAL; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ dev_xdp_set_link(dev, mode, NULL); ++ return 0; ++} ++ ++static void bpf_xdp_link_release(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ rtnl_lock(); ++ ++ /* if racing with net_device's tear down, xdp_link->dev might be ++ * already NULL, in which case link was already auto-detached ++ */ ++ if (xdp_link->dev) { ++ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); ++ xdp_link->dev = NULL; ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int bpf_xdp_link_detach(struct bpf_link *link) ++{ ++ bpf_xdp_link_release(link); ++ return 0; ++} ++ ++static void bpf_xdp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ kfree(xdp_link); ++} ++ ++static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ seq_printf(seq, "ifindex:\t%u\n", ifindex); ++} ++ ++static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ info->xdp.ifindex = ifindex; ++ return 0; ++} ++ ++static int bpf_xdp_link_update(struct 
bpf_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err = 0; ++ ++ rtnl_lock(); ++ ++ /* link might have been auto-released already, so fail */ ++ if (!xdp_link->dev) { ++ err = -ENOLINK; ++ goto out_unlock; ++ } ++ ++ if (old_prog && link->prog != old_prog) { ++ err = -EPERM; ++ goto out_unlock; ++ } ++ old_prog = link->prog; ++ if (old_prog->type != new_prog->type || ++ old_prog->expected_attach_type != new_prog->expected_attach_type) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ ++ if (old_prog == new_prog) { ++ /* no-op, don't disturb drivers */ ++ bpf_prog_put(new_prog); ++ goto out_unlock; ++ } ++ ++ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); ++ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); ++ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, ++ xdp_link->flags, new_prog); ++ if (err) ++ goto out_unlock; ++ ++ old_prog = xchg(&link->prog, new_prog); ++ bpf_prog_put(old_prog); ++ ++out_unlock: ++ rtnl_unlock(); ++ return err; ++} ++ ++static const struct bpf_link_ops bpf_xdp_link_lops = { ++ .release = bpf_xdp_link_release, ++ .dealloc = bpf_xdp_link_dealloc, ++ .detach = bpf_xdp_link_detach, ++ .show_fdinfo = bpf_xdp_link_show_fdinfo, ++ .fill_link_info = bpf_xdp_link_fill_link_info, ++ .update_prog = bpf_xdp_link_update, ++}; ++ ++int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct net *net = current->nsproxy->net_ns; ++ struct bpf_link_primer link_primer; ++ struct bpf_xdp_link *link; ++ struct net_device *dev; ++ int err, fd; ++ ++ rtnl_lock(); ++ dev = dev_get_by_index(net, attr->link_create.target_ifindex); ++ if (!dev) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto unlock; ++ } ++ ++ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); ++ link->dev = dev; ++ link->flags = attr->link_create.flags; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto unlock; ++ } ++ ++ err = dev_xdp_attach_link(dev, NULL, link); ++ rtnl_unlock(); ++ ++ if (err) { ++ link->dev = NULL; ++ bpf_link_cleanup(&link_primer); ++ goto out_put_dev; ++ } ++ ++ fd = bpf_link_settle(&link_primer); ++ /* link itself doesn't hold dev's refcnt to not complicate shutdown */ ++ dev_put(dev); ++ return fd; ++ ++unlock: ++ rtnl_unlock(); ++ ++out_put_dev: ++ dev_put(dev); ++ return err; ++} ++ ++/** ++ * dev_change_xdp_fd - set or clear a bpf program for a device rx path ++ * @dev: device ++ * @extack: netlink extended ack ++ * @fd: new program fd or negative value to clear ++ * @expected_fd: old program fd that userspace expects to replace or clear ++ * @flags: xdp-related flags ++ * ++ * Set or clear a bpf program for a device ++ */ ++int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, ++ int fd, int expected_fd, u32 flags) ++{ ++ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); ++ struct bpf_prog *new_prog = NULL, *old_prog = NULL; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ if (fd >= 0) { ++ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(new_prog)) ++ return PTR_ERR(new_prog); ++ } ++ ++ if (expected_fd >= 0) { ++ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(old_prog)) { ++ err = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto 
err_out; ++ } ++ } ++ ++ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); ++ ++err_out: ++ if (err && new_prog) ++ bpf_prog_put(new_prog); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ return err; ++} ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * @net: the applicable net namespace ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(struct net *net) ++{ ++ int ifindex = net->ifindex; ++ ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(net, ifindex)) ++ return net->ifindex = ifindex; ++ } ++} ++ ++/* Delayed registration/unregisteration */ ++LIST_HEAD(net_todo_list); ++DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); ++ ++static void net_set_todo(struct net_device *dev) ++{ ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ atomic_inc(&dev_net(dev)->dev_unreg_count); ++} ++ ++static netdev_features_t netdev_sync_upper_features(struct net_device *lower, ++ struct net_device *upper, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(upper->wanted_features & feature) ++ && (features & feature)) { ++ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", ++ &feature, upper->name); ++ features &= ~feature; ++ } ++ } ++ ++ return features; ++} ++ ++static void netdev_sync_lower_features(struct net_device *upper, ++ struct net_device *lower, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(features & feature) && (lower->features & feature)) { ++ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", ++ &feature, lower->name); ++ lower->wanted_features &= ~feature; ++ __netdev_update_features(lower); ++ ++ if (unlikely(lower->features & feature)) ++ netdev_WARN(upper, "failed to disable %pNF on %s!\n", ++ &feature, lower->name); ++ else ++ netdev_features_change(lower); ++ } ++ } ++} ++ ++static netdev_features_t netdev_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ /* Fix illegal checksum combinations */ ++ if ((features & NETIF_F_HW_CSUM) && ++ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { ++ netdev_warn(dev, "mixed HW and IP checksum settings.\n"); ++ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); ++ } ++ ++ /* TSO requires that SG is present as well. 
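The core feature fix-ups in netdev_fix_features() and the upper/lower sync helpers above have a driver-side counterpart in ndo_fix_features(). A hedged sketch of such a hook for hypothetical hardware that cannot enable LRO while RX checksumming is off; the hook name and the dependency are assumptions for illustration.

    #include <linux/netdevice.h>

    /* Illustrative only: a driver-level ndo_fix_features() hook enforcing a
     * feature dependency, in the same style as netdev_fix_features().
     */
    static netdev_features_t example_fix_features(struct net_device *dev,
                                                  netdev_features_t features)
    {
            if ((features & NETIF_F_LRO) && !(features & NETIF_F_RXCSUM)) {
                    netdev_dbg(dev, "dropping LRO, RXCSUM is disabled\n");
                    features &= ~NETIF_F_LRO;
            }
            return features;
    }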
*/ ++ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); ++ features &= ~NETIF_F_ALL_TSO; ++ } ++ ++ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IP_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO; ++ features &= ~NETIF_F_TSO_ECN; ++ } ++ ++ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IPV6_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO6; ++ } ++ ++ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ ++ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ ++ /* TSO ECN requires that TSO is present as well. */ ++ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) ++ features &= ~NETIF_F_TSO_ECN; ++ ++ /* Software GSO depends on SG. */ ++ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); ++ features &= ~NETIF_F_GSO; ++ } ++ ++ /* GSO partial features require GSO partial be set */ ++ if ((features & dev->gso_partial_features) && ++ !(features & NETIF_F_GSO_PARTIAL)) { ++ netdev_dbg(dev, ++ "Dropping partially supported GSO features since no GSO partial.\n"); ++ features &= ~dev->gso_partial_features; ++ } ++ ++ if (!(features & NETIF_F_RXCSUM)) { ++ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet ++ * successfully merged by hardware must also have the ++ * checksum verified by hardware. If the user does not ++ * want to enable RXCSUM, logically, we should disable GRO_HW. ++ */ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ /* LRO/HW-GRO features cannot be combined with RX-FCS */ ++ if (features & NETIF_F_RXFCS) { ++ if (features & NETIF_F_LRO) { ++ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { ++ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_HW_TLS_TX) { ++ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == ++ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ bool hw_csum = features & NETIF_F_HW_CSUM; ++ ++ if (!ip_csum && !hw_csum) { ++ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_TX; ++ } ++ } ++ ++ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { ++ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_RX; ++ } ++ ++ return features; ++} ++ ++int __netdev_update_features(struct net_device *dev) ++{ ++ struct net_device *upper, *lower; ++ netdev_features_t features; ++ struct list_head *iter; ++ int err = -1; ++ ++ ASSERT_RTNL(); ++ ++ features = netdev_get_wanted_features(dev); ++ ++ if (dev->netdev_ops->ndo_fix_features) ++ features = dev->netdev_ops->ndo_fix_features(dev, features); ++ ++ /* driver might be less strict about feature dependencies */ ++ features = netdev_fix_features(dev, features); ++ ++ /* some features can't 
be enabled if they're off on an upper device */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) ++ features = netdev_sync_upper_features(dev, upper, features); ++ ++ if (dev->features == features) ++ goto sync_lower; ++ ++ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", ++ &dev->features, &features); ++ ++ if (dev->netdev_ops->ndo_set_features) ++ err = dev->netdev_ops->ndo_set_features(dev, features); ++ else ++ err = 0; ++ ++ if (unlikely(err < 0)) { ++ netdev_err(dev, ++ "set_features() failed (%d); wanted %pNF, left %pNF\n", ++ err, &features, &dev->features); ++ /* return non-0 since some features might have changed and ++ * it's better to fire a spurious notification than miss it ++ */ ++ return -1; ++ } ++ ++sync_lower: ++ /* some features must be disabled on lower devices when disabled ++ * on an upper device (think: bonding master or bridge) ++ */ ++ netdev_for_each_lower_dev(dev, lower, iter) ++ netdev_sync_lower_features(dev, lower, features); ++ ++ if (!err) { ++ netdev_features_t diff = features ^ dev->features; ++ ++ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ /* udp_tunnel_{get,drop}_rx_info both need ++ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the ++ * device, or they won't do anything. ++ * Thus we need to update dev->features ++ * *before* calling udp_tunnel_get_rx_info, ++ * but *after* calling udp_tunnel_drop_rx_info. ++ */ ++ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ dev->features = features; ++ udp_tunnel_get_rx_info(dev); ++ } else { ++ udp_tunnel_drop_rx_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_ctag_filter_info(dev); ++ } else { ++ vlan_drop_rx_ctag_filter_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_STAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_stag_filter_info(dev); ++ } else { ++ vlan_drop_rx_stag_filter_info(dev); ++ } ++ } ++ ++ dev->features = features; ++ } ++ ++ return err < 0 ? 0 : 1; ++} ++ ++/** ++ * netdev_update_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications if it ++ * has changed. Should be called after driver or hardware dependent ++ * conditions might have changed that influence the features. ++ */ ++void netdev_update_features(struct net_device *dev) ++{ ++ if (__netdev_update_features(dev)) ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_update_features); ++ ++/** ++ * netdev_change_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications even ++ * if they have not changed. Should be called instead of ++ * netdev_update_features() if also dev->vlan_features might ++ * have changed to allow the changes to be propagated to stacked ++ * VLAN devices. ++ */ ++void netdev_change_features(struct net_device *dev) ++{ ++ __netdev_update_features(dev); ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_change_features); ++ ++/** ++ * netif_stacked_transfer_operstate - transfer operstate ++ * @rootdev: the root or lower level device to transfer state from ++ * @dev: the device to transfer operstate to ++ * ++ * Transfer operational state from root to device. This is normally ++ * called when a stacking relationship exists between the root ++ * device and the device(a leaf device). 
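Usage note for netdev_update_features() documented above: drivers call it, under RTNL, whenever a hardware- or firmware-dependent condition changes what they can offload. A hedged sketch; the "firmware lost TSO" condition and the function name are assumptions.

    #include <linux/netdevice.h>
    #include <linux/rtnetlink.h>

    /* Illustrative only: re-evaluate dev->features after the hardware's
     * capabilities changed at runtime.
     */
    static void example_recheck_features(struct net_device *dev, bool fw_lost_tso)
    {
            rtnl_lock();
            if (fw_lost_tso)
                    dev->hw_features &= ~NETIF_F_ALL_TSO;
            netdev_update_features(dev);
            rtnl_unlock();
    }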
++ */ ++void netif_stacked_transfer_operstate(const struct net_device *rootdev, ++ struct net_device *dev) ++{ ++ if (rootdev->operstate == IF_OPER_DORMANT) ++ netif_dormant_on(dev); ++ else ++ netif_dormant_off(dev); ++ ++ if (rootdev->operstate == IF_OPER_TESTING) ++ netif_testing_on(dev); ++ else ++ netif_testing_off(dev); ++ ++ if (netif_carrier_ok(rootdev)) ++ netif_carrier_on(dev); ++ else ++ netif_carrier_off(dev); ++} ++EXPORT_SYMBOL(netif_stacked_transfer_operstate); ++ ++static int netif_alloc_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ struct netdev_rx_queue *rx; ++ size_t sz = count * sizeof(*rx); ++ int err = 0; ++ ++ BUG_ON(count < 1); ++ ++ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!rx) ++ return -ENOMEM; ++ ++ dev->_rx = rx; ++ ++ for (i = 0; i < count; i++) { ++ rx[i].dev = dev; ++ ++ /* XDP RX-queue setup */ ++ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); ++ if (err < 0) ++ goto err_rxq_info; ++ } ++ return 0; ++ ++err_rxq_info: ++ /* Rollback successful reg's and free other resources */ ++ while (i--) ++ xdp_rxq_info_unreg(&rx[i].xdp_rxq); ++ kvfree(dev->_rx); ++ dev->_rx = NULL; ++ return err; ++} ++ ++static void netif_free_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ ++ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ ++ if (!dev->_rx) ++ return; ++ ++ for (i = 0; i < count; i++) ++ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); ++ ++ kvfree(dev->_rx); ++} ++ ++static void netdev_init_one_queue(struct net_device *dev, ++ struct netdev_queue *queue, void *_unused) ++{ ++ /* Initialize queue lock */ ++ spin_lock_init(&queue->_xmit_lock); ++ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); ++ queue->xmit_lock_owner = -1; ++ netdev_queue_numa_node_write(queue, NUMA_NO_NODE); ++ queue->dev = dev; ++#ifdef CONFIG_BQL ++ dql_init(&queue->dql, HZ); ++#endif ++} ++ ++static void netif_free_tx_queues(struct net_device *dev) ++{ ++ kvfree(dev->_tx); ++} ++ ++static int netif_alloc_netdev_queues(struct net_device *dev) ++{ ++ unsigned int count = dev->num_tx_queues; ++ struct netdev_queue *tx; ++ size_t sz = count * sizeof(*tx); ++ ++ if (count < 1 || count > 0xffff) ++ return -EINVAL; ++ ++ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!tx) ++ return -ENOMEM; ++ ++ dev->_tx = tx; ++ ++ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); ++ spin_lock_init(&dev->tx_global_lock); ++ ++ return 0; ++} ++ ++void netif_tx_stop_all_queues(struct net_device *dev) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < dev->num_tx_queues; i++) { ++ struct netdev_queue *txq = netdev_get_tx_queue(dev, i); ++ ++ netif_tx_stop_queue(txq); ++ } ++} ++EXPORT_SYMBOL(netif_tx_stop_all_queues); ++ ++/** ++ * register_netdevice() - register a network device ++ * @dev: device to register ++ * ++ * Take a prepared network device structure and make it externally accessible. ++ * A %NETDEV_REGISTER message is sent to the netdev notifier chain. ++ * Callers must hold the rtnl lock - you may want register_netdev() ++ * instead of this. ++ */ ++int register_netdevice(struct net_device *dev) ++{ ++ int ret; ++ struct net *net = dev_net(dev); ++ ++ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < ++ NETDEV_FEATURE_COUNT); ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ might_sleep(); ++ ++ /* When net_device's are persistent, this will be fatal. 
*/ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ BUG_ON(!net); ++ ++ ret = ethtool_check_ops(dev->ethtool_ops); ++ if (ret) ++ return ret; ++ ++ spin_lock_init(&dev->addr_list_lock); ++ netdev_set_addr_lockdep_class(dev); ++ ++ ret = dev_get_valid_name(net, dev, dev->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = -ENOMEM; ++ dev->name_node = netdev_name_node_head_alloc(dev); ++ if (!dev->name_node) ++ goto out; ++ ++ /* Init, if this function is available */ ++ if (dev->netdev_ops->ndo_init) { ++ ret = dev->netdev_ops->ndo_init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto err_free_name; ++ } ++ } ++ ++ if (((dev->hw_features | dev->features) & ++ NETIF_F_HW_VLAN_CTAG_FILTER) && ++ (!dev->netdev_ops->ndo_vlan_rx_add_vid || ++ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { ++ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ++ ret = -EINVAL; ++ goto err_uninit; ++ } ++ ++ ret = -EBUSY; ++ if (!dev->ifindex) ++ dev->ifindex = dev_new_index(net); ++ else if (__dev_get_by_index(net, dev->ifindex)) ++ goto err_uninit; ++ ++ /* Transfer changeable features to wanted_features and enable ++ * software offloads (GSO and GRO). ++ */ ++ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); ++ dev->features |= NETIF_F_SOFT_FEATURES; ++ ++ if (dev->udp_tunnel_nic_info) { ++ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ } ++ ++ dev->wanted_features = dev->features & dev->hw_features; ++ ++ if (!(dev->flags & IFF_LOOPBACK)) ++ dev->hw_features |= NETIF_F_NOCACHE_COPY; ++ ++ /* If IPv4 TCP segmentation offload is supported we should also ++ * allow the device to enable segmenting the frame with the option ++ * of ignoring a static IP ID value. This doesn't enable the ++ * feature itself but allows the user to enable it later. ++ */ ++ if (dev->hw_features & NETIF_F_TSO) ++ dev->hw_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->vlan_features & NETIF_F_TSO) ++ dev->vlan_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->mpls_features & NETIF_F_TSO) ++ dev->mpls_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->hw_enc_features & NETIF_F_TSO) ++ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; ++ ++ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. ++ */ ++ dev->vlan_features |= NETIF_F_HIGHDMA; ++ ++ /* Make NETIF_F_SG inheritable to tunnel devices. ++ */ ++ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; ++ ++ /* Make NETIF_F_SG inheritable to MPLS. ++ */ ++ dev->mpls_features |= NETIF_F_SG; ++ ++ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto err_uninit; ++ ++ ret = netdev_register_kobject(dev); ++ write_lock(&dev_base_lock); ++ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; ++ write_unlock(&dev_base_lock); ++ if (ret) ++ goto err_uninit; ++ ++ __netdev_update_features(dev); ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ linkwatch_init_dev(dev); ++ ++ dev_init_scheduler(dev); ++ ++ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); ++ list_netdevice(dev); ++ ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ ++ /* If the device has permanent device address, driver should ++ * set dev_addr and also addr_assign_type should be set to ++ * NET_ADDR_PERM (default value). 
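The NET_ADDR_PERM handling above is why drivers install their hardware MAC before registration. A hedged sketch; the EEPROM-provided hw_mac and the function name are assumptions.

    #include <linux/etherdevice.h>

    /* Illustrative only: set the permanent hardware address prior to
     * register_netdevice(); addr_assign_type stays NET_ADDR_PERM, so the
     * core copies it into dev->perm_addr as described above.
     */
    static int example_install_hw_addr(struct net_device *dev, const u8 *hw_mac)
    {
            if (!is_valid_ether_addr(hw_mac))
                    return -EADDRNOTAVAIL;

            eth_hw_addr_set(dev, hw_mac);
            return 0;
    }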
++ */ ++ if (dev->addr_assign_type == NET_ADDR_PERM) ++ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); ++ ++ /* Notify protocols, that a new device appeared. */ ++ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) { ++ /* Expect explicit free_netdev() on failure */ ++ dev->needs_free_netdev = false; ++ unregister_netdevice_queue(dev, NULL); ++ goto out; ++ } ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++out: ++ return ret; ++ ++err_uninit: ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++err_free_name: ++ netdev_name_node_free(dev->name_node); ++ goto out; ++} ++EXPORT_SYMBOL(register_netdevice); ++ ++/** ++ * init_dummy_netdev - init a dummy network device for NAPI ++ * @dev: device to init ++ * ++ * This takes a network device structure and initialize the minimum ++ * amount of fields so it can be used to schedule NAPI polls without ++ * registering a full blown interface. This is to be used by drivers ++ * that need to tie several hardware interfaces to a single NAPI ++ * poll scheduler due to HW limitations. ++ */ ++int init_dummy_netdev(struct net_device *dev) ++{ ++ /* Clear everything. Note we don't initialize spinlocks ++ * are they aren't supposed to be taken by any of the ++ * NAPI code and this dummy netdev is supposed to be ++ * only ever used for NAPI polls ++ */ ++ memset(dev, 0, sizeof(struct net_device)); ++ ++ /* make sure we BUG if trying to hit standard ++ * register/unregister code path ++ */ ++ dev->reg_state = NETREG_DUMMY; ++ ++ /* NAPI wants this */ ++ INIT_LIST_HEAD(&dev->napi_list); ++ ++ /* a dummy interface is started by default */ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ /* napi_busy_loop stats accounting wants this */ ++ dev_net_set(dev, &init_net); ++ ++ /* Note : We dont allocate pcpu_refcnt for dummy devices, ++ * because users of this 'device' dont need to change ++ * its refcount. ++ */ ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(init_dummy_netdev); ++ ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdevice that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. 
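For context, the usual probe-time pattern around the register_netdev() wrapper described above, as a hedged sketch. alloc_etherdev() supplies the "eth%d" name template that register_netdev() expands; private data and ndo setup are omitted for illustration.

    #include <linux/etherdevice.h>
    #include <linux/netdevice.h>

    /* Illustrative only: allocate, register and, on failure, free a netdev. */
    static struct net_device *example_create_netdev(void)
    {
            struct net_device *dev;

            dev = alloc_etherdev(0);        /* name template "eth%d" */
            if (!dev)
                    return NULL;

            if (register_netdev(dev)) {
                    free_netdev(dev);
                    return NULL;
            }
            return dev;
    }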
++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ if (rtnl_lock_killable()) ++ return -EINTR; ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++int netdev_refcnt_read(const struct net_device *dev) ++{ ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ int i, refcnt = 0; ++ ++ for_each_possible_cpu(i) ++ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); ++ return refcnt; ++#else ++ return refcount_read(&dev->dev_refcnt); ++#endif ++} ++EXPORT_SYMBOL(netdev_refcnt_read); ++ ++int netdev_unregister_timeout_secs __read_mostly = 10; ++ ++#define WAIT_REFS_MIN_MSECS 1 ++#define WAIT_REFS_MAX_MSECS 250 ++/** ++ * netdev_wait_allrefs_any - wait until all references are gone. ++ * @list: list of net_devices to wait on ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static struct net_device *netdev_wait_allrefs_any(struct list_head *list) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ struct net_device *dev; ++ int wait = 0; ++ ++ rebroadcast_time = warning_time = jiffies; ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ while (true) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_lock(); ++ ++ /* Rebroadcast unregister notification */ ++ list_for_each_entry(dev, list, todo_list) ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ __rtnl_unlock(); ++ rcu_barrier(); ++ rtnl_lock(); ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ break; ++ } ++ ++ __rtnl_unlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ if (!wait) { ++ rcu_barrier(); ++ wait = WAIT_REFS_MIN_MSECS; ++ } else { ++ msleep(wait); ++ wait = min(wait << 1, WAIT_REFS_MAX_MSECS); ++ } ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ if (time_after(jiffies, warning_time + ++ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { ++ list_for_each_entry(dev, list, todo_list) { ++ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", ++ dev->name, netdev_refcnt_read(dev)); ++ ref_tracker_dir_print(&dev->refcnt_tracker, 10); ++ } ++ ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... ++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock(). ++ * This allows us to deal with problems: ++ * 1) We can delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ * ++ * We must not return until all unregister events added during ++ * the interval the lock was held have been completed. 
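The register/unregister/free sequence described in the comment above, written out as a hedged sketch; the function and parameter names are assumptions, and error handling for the new device is left to the caller.

    /* Illustrative only: register a replacement device and retire the old
     * one in a single RTNL section; netdev_run_todo() runs from
     * rtnl_unlock(), and the caller frees the old device afterwards.
     */
    static int example_replace_device(struct net_device *dev_new,
                                      struct net_device *dev_old)
    {
            int err;

            rtnl_lock();
            err = register_netdevice(dev_new);   /* caller frees dev_new on error */
            unregister_netdevice(dev_old);
            rtnl_unlock();
            free_netdev(dev_old);
            return err;
    }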
++ */ ++void netdev_run_todo(void) ++{ ++ struct net_device *dev, *tmp; ++ struct list_head list; ++#ifdef CONFIG_LOCKDEP ++ struct list_head unlink_list; ++ ++ list_replace_init(&net_unlink_list, &unlink_list); ++ ++ while (!list_empty(&unlink_list)) { ++ struct net_device *dev = list_first_entry(&unlink_list, ++ struct net_device, ++ unlink_list); ++ list_del_init(&dev->unlink_list); ++ dev->nested_level = dev->lower_level - 1; ++ } ++#endif ++ ++ /* Snapshot list, allow later requests */ ++ list_replace_init(&net_todo_list, &list); ++ ++ __rtnl_unlock(); ++ ++ /* Wait for rcu callbacks to finish before next phase */ ++ if (!list_empty(&list)) ++ rcu_barrier(); ++ ++ list_for_each_entry_safe(dev, tmp, &list, todo_list) { ++ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { ++ netdev_WARN(dev, "run_todo but not unregistering\n"); ++ list_del(&dev->todo_list); ++ continue; ++ } ++ ++ write_lock(&dev_base_lock); ++ dev->reg_state = NETREG_UNREGISTERED; ++ write_unlock(&dev_base_lock); ++ linkwatch_forget_dev(dev); ++ } ++ ++ while (!list_empty(&list)) { ++ dev = netdev_wait_allrefs_any(&list); ++ list_del(&dev->todo_list); ++ ++ /* paranoia */ ++ BUG_ON(netdev_refcnt_read(dev) != 1); ++ BUG_ON(!list_empty(&dev->ptype_all)); ++ BUG_ON(!list_empty(&dev->ptype_specific)); ++ WARN_ON(rcu_access_pointer(dev->ip_ptr)); ++ WARN_ON(rcu_access_pointer(dev->ip6_ptr)); ++#if IS_ENABLED(CONFIG_DECNET) ++ WARN_ON(dev->dn_ptr); ++#endif ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++ if (dev->needs_free_netdev) ++ free_netdev(dev); ++ ++ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) ++ wake_up(&netdev_unregistering_wq); ++ ++ /* Free network device */ ++ kobject_put(&dev->dev.kobj); ++ } ++} ++ ++/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has ++ * all the same fields in the same order as net_device_stats, with only ++ * the type differing, but rtnl_link_stats64 may have additional fields ++ * at the end for newer counters. ++ */ ++void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, ++ const struct net_device_stats *netdev_stats) ++{ ++#if BITS_PER_LONG == 64 ++ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); ++ memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + sizeof(*netdev_stats), 0, ++ sizeof(*stats64) - sizeof(*netdev_stats)); ++#else ++ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); ++ const unsigned long *src = (const unsigned long *)netdev_stats; ++ u64 *dst = (u64 *)stats64; ++ ++ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); ++ for (i = 0; i < n; i++) ++ dst[i] = src[i]; ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + n * sizeof(u64), 0, ++ sizeof(*stats64) - n * sizeof(u64)); ++#endif ++} ++EXPORT_SYMBOL(netdev_stats_to_stats64); ++ ++struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) ++{ ++ struct net_device_core_stats __percpu *p; ++ ++ p = alloc_percpu_gfp(struct net_device_core_stats, ++ GFP_ATOMIC | __GFP_NOWARN); ++ ++ if (p && cmpxchg(&dev->core_stats, NULL, p)) ++ free_percpu(p); ++ ++ /* This READ_ONCE() pairs with the cmpxchg() above */ ++ return READ_ONCE(dev->core_stats); ++} ++EXPORT_SYMBOL(netdev_core_stats_alloc); ++ ++/** ++ * dev_get_stats - get network device statistics ++ * @dev: device to get statistics from ++ * @storage: place to store stats ++ * ++ * Get network statistics from device. Return @storage. 
++ * The device driver may provide its own method by setting ++ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; ++ * otherwise the internal statistics structure is used. ++ */ ++struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ const struct net_device_core_stats __percpu *p; ++ ++ if (ops->ndo_get_stats64) { ++ memset(storage, 0, sizeof(*storage)); ++ ops->ndo_get_stats64(dev, storage); ++ } else if (ops->ndo_get_stats) { ++ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); ++ } else { ++ netdev_stats_to_stats64(storage, &dev->stats); ++ } ++ ++ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ ++ p = READ_ONCE(dev->core_stats); ++ if (p) { ++ const struct net_device_core_stats *core_stats; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ core_stats = per_cpu_ptr(p, i); ++ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); ++ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); ++ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); ++ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); ++ } ++ } ++ return storage; ++} ++EXPORT_SYMBOL(dev_get_stats); ++ ++/** ++ * dev_fetch_sw_netstats - get per-cpu network device statistics ++ * @s: place to store stats ++ * @netstats: per-cpu network stats to read from ++ * ++ * Read per-cpu network statistics and populate the related fields in @s. ++ */ ++void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, ++ const struct pcpu_sw_netstats __percpu *netstats) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ u64 rx_packets, rx_bytes, tx_packets, tx_bytes; ++ const struct pcpu_sw_netstats *stats; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(netstats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ rx_packets = u64_stats_read(&stats->rx_packets); ++ rx_bytes = u64_stats_read(&stats->rx_bytes); ++ tx_packets = u64_stats_read(&stats->tx_packets); ++ tx_bytes = u64_stats_read(&stats->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ s->rx_packets += rx_packets; ++ s->rx_bytes += rx_bytes; ++ s->tx_packets += tx_packets; ++ s->tx_bytes += tx_bytes; ++ } ++} ++EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); ++ ++/** ++ * dev_get_tstats64 - ndo_get_stats64 implementation ++ * @dev: device to get statistics from ++ * @s: place to store stats ++ * ++ * Populate @s from dev->stats and dev->tstats. Can be used as ++ * ndo_get_stats64() callback. 
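A hedged sketch of how a driver wires up the helper documented above: keep counters in per-cpu dev->tstats and point ndo_get_stats64 at dev_get_tstats64(). The ops structure and init function are illustrative; the matching free_percpu() on teardown is omitted.

    #include <linux/netdevice.h>

    static const struct net_device_ops example_netdev_ops = {
            .ndo_get_stats64 = dev_get_tstats64,
            /* ... remaining ndo_* callbacks elided ... */
    };

    /* Illustrative only: allocate the per-cpu counters dev_get_tstats64() reads. */
    static int example_init_stats(struct net_device *dev)
    {
            dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
            return dev->tstats ? 0 : -ENOMEM;
    }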
++ */ ++void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) ++{ ++ netdev_stats_to_stats64(s, &dev->stats); ++ dev_fetch_sw_netstats(s, dev->tstats); ++} ++EXPORT_SYMBOL_GPL(dev_get_tstats64); ++ ++struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) ++{ ++ struct netdev_queue *queue = dev_ingress_queue(dev); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (queue) ++ return queue; ++ queue = kzalloc(sizeof(*queue), GFP_KERNEL); ++ if (!queue) ++ return NULL; ++ netdev_init_one_queue(dev, queue, NULL); ++ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); ++ queue->qdisc_sleeping = &noop_qdisc; ++ rcu_assign_pointer(dev->ingress_queue, queue); ++#endif ++ return queue; ++} ++ ++static const struct ethtool_ops default_ethtool_ops; ++ ++void netdev_set_default_ethtool_ops(struct net_device *dev, ++ const struct ethtool_ops *ops) ++{ ++ if (dev->ethtool_ops == &default_ethtool_ops) ++ dev->ethtool_ops = ops; ++} ++EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); ++ ++void netdev_freemem(struct net_device *dev) ++{ ++ char *addr = (char *)dev - dev->padded; ++ ++ kvfree(addr); ++} ++ ++/** ++ * alloc_netdev_mqs - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @name_assign_type: origin of device name ++ * @setup: callback to initialize device ++ * @txqs: the number of TX subqueues to allocate ++ * @rxqs: the number of RX subqueues to allocate ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. Also allocates subqueue structs ++ * for each queue on the device. ++ */ ++struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, ++ unsigned char name_assign_type, ++ void (*setup)(struct net_device *), ++ unsigned int txqs, unsigned int rxqs) ++{ ++ struct net_device *dev; ++ unsigned int alloc_size; ++ struct net_device *p; ++ ++ BUG_ON(strlen(name) >= sizeof(dev->name)); ++ ++ if (txqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); ++ return NULL; ++ } ++ ++ if (rxqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); ++ return NULL; ++ } ++ ++ alloc_size = sizeof(struct net_device); ++ if (sizeof_priv) { ++ /* ensure 32-byte alignment of private area */ ++ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); ++ alloc_size += sizeof_priv; ++ } ++ /* ensure 32-byte alignment of whole construct */ ++ alloc_size += NETDEV_ALIGN - 1; ++ ++ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!p) ++ return NULL; ++ ++ dev = PTR_ALIGN(p, NETDEV_ALIGN); ++ dev->padded = (char *)dev - (char *)p; ++ ++ ref_tracker_dir_init(&dev->refcnt_tracker, 128); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ dev->pcpu_refcnt = alloc_percpu(int); ++ if (!dev->pcpu_refcnt) ++ goto free_dev; ++ __dev_hold(dev); ++#else ++ refcount_set(&dev->dev_refcnt, 1); ++#endif ++ ++ if (dev_addr_init(dev)) ++ goto free_pcpu; ++ ++ dev_mc_init(dev); ++ dev_uc_init(dev); ++ ++ dev_net_set(dev, &init_net); ++ ++ dev->gso_max_size = GSO_LEGACY_MAX_SIZE; ++ dev->gso_max_segs = GSO_MAX_SEGS; ++ dev->gro_max_size = GRO_LEGACY_MAX_SIZE; ++ dev->tso_max_size = TSO_LEGACY_MAX_SIZE; ++ dev->tso_max_segs = TSO_MAX_SEGS; ++ dev->upper_level = 1; ++ dev->lower_level = 1; ++#ifdef CONFIG_LOCKDEP ++ dev->nested_level = 0; ++ INIT_LIST_HEAD(&dev->unlink_list); ++#endif ++ ++ INIT_LIST_HEAD(&dev->napi_list); ++ INIT_LIST_HEAD(&dev->unreg_list); ++ INIT_LIST_HEAD(&dev->close_list); ++ 
INIT_LIST_HEAD(&dev->link_watch_list); ++ INIT_LIST_HEAD(&dev->adj_list.upper); ++ INIT_LIST_HEAD(&dev->adj_list.lower); ++ INIT_LIST_HEAD(&dev->ptype_all); ++ INIT_LIST_HEAD(&dev->ptype_specific); ++ INIT_LIST_HEAD(&dev->net_notifier_list); ++#ifdef CONFIG_NET_SCHED ++ hash_init(dev->qdisc_hash); ++#endif ++ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; ++ setup(dev); ++ ++ if (!dev->tx_queue_len) { ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; ++ } ++ ++ dev->num_tx_queues = txqs; ++ dev->real_num_tx_queues = txqs; ++ if (netif_alloc_netdev_queues(dev)) ++ goto free_all; ++ ++ dev->num_rx_queues = rxqs; ++ dev->real_num_rx_queues = rxqs; ++ if (netif_alloc_rx_queues(dev)) ++ goto free_all; ++ ++ strcpy(dev->name, name); ++ dev->name_assign_type = name_assign_type; ++ dev->group = INIT_NETDEV_GROUP; ++ if (!dev->ethtool_ops) ++ dev->ethtool_ops = &default_ethtool_ops; ++ ++ nf_hook_netdev_init(dev); ++ ++ return dev; ++ ++free_all: ++ free_netdev(dev); ++ return NULL; ++ ++free_pcpu: ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++free_dev: ++#endif ++ netdev_freemem(dev); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_netdev_mqs); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. If this ++ * is the last reference then it will be freed.Must be called in process ++ * context. ++ */ ++void free_netdev(struct net_device *dev) ++{ ++ struct napi_struct *p, *n; ++ ++ might_sleep(); ++ ++ /* When called immediately after register_netdevice() failed the unwind ++ * handling may still be dismantling the device. Handle that case by ++ * deferring the free. ++ */ ++ if (dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ dev->needs_free_netdev = true; ++ return; ++ } ++ ++ netif_free_tx_queues(dev); ++ netif_free_rx_queues(dev); ++ ++ kfree(rcu_dereference_protected(dev->ingress_queue, 1)); ++ ++ /* Flush device addresses */ ++ dev_addr_flush(dev); ++ ++ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) ++ netif_napi_del(p); ++ ++ ref_tracker_dir_exit(&dev->refcnt_tracker); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++ dev->pcpu_refcnt = NULL; ++#endif ++ free_percpu(dev->core_stats); ++ dev->core_stats = NULL; ++ free_percpu(dev->xdp_bulkq); ++ dev->xdp_bulkq = NULL; ++ ++ /* Compatibility with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ netdev_freemem(dev); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via device release */ ++ put_device(&dev->dev); ++} ++EXPORT_SYMBOL(free_netdev); ++ ++/** ++ * synchronize_net - Synchronize with packet receive processing ++ * ++ * Wait for packets currently being received to be done. ++ * Does not block later packets from starting. ++ */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ if (rtnl_is_locked()) ++ synchronize_rcu_expedited(); ++ else ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(synchronize_net); ++ ++/** ++ * unregister_netdevice_queue - remove device from the kernel ++ * @dev: device ++ * @head: list ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * If head not NULL, device is queued to be unregistered later. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. 
++ */ ++ ++void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ++{ ++ ASSERT_RTNL(); ++ ++ if (head) { ++ list_move_tail(&dev->unreg_list, head); ++ } else { ++ LIST_HEAD(single); ++ ++ list_add(&dev->unreg_list, &single); ++ unregister_netdevice_many(&single); ++ } ++} ++EXPORT_SYMBOL(unregister_netdevice_queue); ++ ++/** ++ * unregister_netdevice_many - unregister many devices ++ * @head: list of devices ++ * ++ * Note: As most callers use a stack allocated list_head, ++ * we force a list_del() to make sure stack wont be corrupted later. ++ */ ++void unregister_netdevice_many(struct list_head *head) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(close_head); ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ if (list_empty(head)) ++ return; ++ ++ list_for_each_entry_safe(dev, tmp, head, unreg_list) { ++ /* Some devices call without registering ++ * for initialization unwind. Remove those ++ * devices and proceed with the remaining. ++ */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ pr_debug("unregister_netdevice: device %s/%p never was registered\n", ++ dev->name, dev); ++ ++ WARN_ON(1); ++ list_del(&dev->unreg_list); ++ continue; ++ } ++ dev->dismantle = true; ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ } ++ ++ /* If device is running, close it first. */ ++ list_for_each_entry(dev, head, unreg_list) ++ list_add_tail(&dev->close_list, &close_head); ++ dev_close_many(&close_head, true); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ /* And unlink it from device chain. */ ++ write_lock(&dev_base_lock); ++ unlist_netdevice(dev, false); ++ dev->reg_state = NETREG_UNREGISTERING; ++ write_unlock(&dev_base_lock); ++ } ++ flush_all_backlogs(); ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ struct sk_buff *skb = NULL; ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ dev_xdp_uninstall(dev); ++ ++ netdev_offload_xstats_disable_all(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, ++ GFP_KERNEL, NULL, 0); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ netdev_name_node_alt_flush(dev); ++ netdev_name_node_free(dev->name_node); ++ ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ ++ if (skb) ++ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); ++ ++ /* Notifier chain MUST detach us all upper devices. */ ++ WARN_ON(netdev_has_any_upper_dev(dev)); ++ WARN_ON(netdev_has_any_lower_dev(dev)); ++ ++ /* Remove entries from kobject tree */ ++ netdev_unregister_kobject(dev); ++#ifdef CONFIG_XPS ++ /* Remove XPS queueing entries */ ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ } ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ netdev_put(dev, &dev->dev_registered_tracker); ++ net_set_todo(dev); ++ } ++ ++ list_del(head); ++} ++EXPORT_SYMBOL(unregister_netdevice_many); ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. 
++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(unregister_netdev); ++ ++/** ++ * __dev_change_net_namespace - move device to different nethost namespace ++ * @dev: device ++ * @net: network namespace ++ * @pat: If not NULL name pattern to try if the current device name ++ * is already taken in the destination network namespace. ++ * @new_ifindex: If not zero, specifies device index in the target ++ * namespace. ++ * ++ * This function shuts down a device interface and moves it ++ * to a new network namespace. On success 0 is returned, on ++ * a failure a netagive errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. ++ */ ++ ++int __dev_change_net_namespace(struct net_device *dev, struct net *net, ++ const char *pat, int new_ifindex) ++{ ++ struct net *net_old = dev_net(dev); ++ int err, new_nsid; ++ ++ ASSERT_RTNL(); ++ ++ /* Don't allow namespace local devices to be moved. */ ++ err = -EINVAL; ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ goto out; ++ ++ /* Ensure the device has been registrered */ ++ if (dev->reg_state != NETREG_REGISTERED) ++ goto out; ++ ++ /* Get out if there is nothing todo */ ++ err = 0; ++ if (net_eq(net_old, net)) ++ goto out; ++ ++ /* Pick the destination device name, and ensure ++ * we can use it in the destination network namespace. ++ */ ++ err = -EEXIST; ++ if (netdev_name_in_use(net, dev->name)) { ++ /* We get here if we can't use the current device name */ ++ if (!pat) ++ goto out; ++ err = dev_get_valid_name(net, dev, pat); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* Check that new_ifindex isn't used yet. */ ++ err = -EBUSY; ++ if (new_ifindex && __dev_get_by_index(net, new_ifindex)) ++ goto out; ++ ++ /* ++ * And now a mini version of register_netdevice unregister_netdevice. ++ */ ++ ++ /* If device is running close it first. */ ++ dev_close(dev); ++ ++ /* And unlink it from device chain */ ++ unlist_netdevice(dev, true); ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ * ++ * Note that dev->reg_state stays at NETREG_REGISTERED. ++ * This is wanted because this way 8021q and macvlan know ++ * the device is just moving and can keep their slaves up. 
++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ rcu_barrier(); ++ ++ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); ++ /* If there is an ifindex conflict assign a new one */ ++ if (!new_ifindex) { ++ if (__dev_get_by_index(net, dev->ifindex)) ++ new_ifindex = dev_new_index(net); ++ else ++ new_ifindex = dev->ifindex; ++ } ++ ++ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, ++ new_ifindex); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ /* Send a netdev-removed uevent to the old namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); ++ netdev_adjacent_del_links(dev); ++ ++ /* Move per-net netdevice notifiers that are following the netdevice */ ++ move_netdevice_notifiers_dev_net(dev, net); ++ ++ /* Actually switch the network namespace */ ++ dev_net_set(dev, net); ++ dev->ifindex = new_ifindex; ++ ++ /* Send a netdev-add uevent to the new namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); ++ netdev_adjacent_add_links(dev); ++ ++ /* Fixup kobjects */ ++ err = device_rename(&dev->dev, dev->name); ++ WARN_ON(err); ++ ++ /* Adapt owner in case owning user namespace of target network ++ * namespace is different from the original one. ++ */ ++ err = netdev_change_owner(dev, net_old, net); ++ WARN_ON(err); ++ ++ /* Add the device back in the hashes */ ++ list_netdevice(dev); ++ ++ /* Notify protocols, that a new device appeared. */ ++ call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++ synchronize_net(); ++ err = 0; ++out: ++ return err; ++} ++EXPORT_SYMBOL_GPL(__dev_change_net_namespace); ++ ++static int dev_cpu_dead(unsigned int oldcpu) ++{ ++ struct sk_buff **list_skb; ++ struct sk_buff *skb; ++ unsigned int cpu; ++ struct softnet_data *sd, *oldsd, *remsd = NULL; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Append output queue from offline CPU. */ ++ if (oldsd->output_queue) { ++ *sd->output_queue_tailp = oldsd->output_queue; ++ sd->output_queue_tailp = oldsd->output_queue_tailp; ++ oldsd->output_queue = NULL; ++ oldsd->output_queue_tailp = &oldsd->output_queue; ++ } ++ /* Append NAPI poll list from offline CPU, with one exception : ++ * process_backlog() must be called by cpu owning percpu backlog. ++ * We properly handle process_queue & input_pkt_queue later. 
++ */ ++ while (!list_empty(&oldsd->poll_list)) { ++ struct napi_struct *napi = list_first_entry(&oldsd->poll_list, ++ struct napi_struct, ++ poll_list); ++ ++ list_del_init(&napi->poll_list); ++ if (napi->poll == process_backlog) ++ napi->state = 0; ++ else ++ ____napi_schedule(sd, napi); ++ } ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++#ifdef CONFIG_RPS ++ remsd = oldsd->rps_ipi_list; ++ oldsd->rps_ipi_list = NULL; ++#endif ++ /* send out pending IPI's on offline CPU */ ++ net_rps_send_ipi(remsd); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->process_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ ++ return 0; ++} ++ ++/** ++ * netdev_increment_features - increment feature set by one ++ * @all: current feature set ++ * @one: new feature set ++ * @mask: mask feature set ++ * ++ * Computes a new feature set after adding a device with feature set ++ * @one to the master device with current feature set @all. Will not ++ * enable anything that is off in @mask. Returns the new feature set. ++ */ ++netdev_features_t netdev_increment_features(netdev_features_t all, ++ netdev_features_t one, netdev_features_t mask) ++{ ++ if (mask & NETIF_F_HW_CSUM) ++ mask |= NETIF_F_CSUM_MASK; ++ mask |= NETIF_F_VLAN_CHALLENGED; ++ ++ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; ++ all &= one | ~NETIF_F_ALL_FOR_ALL; ++ ++ /* If one device supports hw checksumming, set for all. */ ++ if (all & NETIF_F_HW_CSUM) ++ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); ++ ++ return all; ++} ++EXPORT_SYMBOL(netdev_increment_features); ++ ++static struct hlist_head * __net_init netdev_create_hash(void) ++{ ++ int i; ++ struct hlist_head *hash; ++ ++ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); ++ if (hash != NULL) ++ for (i = 0; i < NETDEV_HASHENTRIES; i++) ++ INIT_HLIST_HEAD(&hash[i]); ++ ++ return hash; ++} ++ ++/* Initialize per network namespace state */ ++static int __net_init netdev_init(struct net *net) ++{ ++ BUILD_BUG_ON(GRO_HASH_BUCKETS > ++ 8 * sizeof_field(struct napi_struct, gro_bitmask)); ++ ++ INIT_LIST_HEAD(&net->dev_base_head); ++ ++ net->dev_name_head = netdev_create_hash(); ++ if (net->dev_name_head == NULL) ++ goto err_name; ++ ++ net->dev_index_head = netdev_create_hash(); ++ if (net->dev_index_head == NULL) ++ goto err_idx; ++ ++ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); ++ ++ return 0; ++ ++err_idx: ++ kfree(net->dev_name_head); ++err_name: ++ return -ENOMEM; ++} ++ ++/** ++ * netdev_drivername - network driver for the device ++ * @dev: network device ++ * ++ * Determine network driver for device. 
++ */ ++const char *netdev_drivername(const struct net_device *dev) ++{ ++ const struct device_driver *driver; ++ const struct device *parent; ++ const char *empty = ""; ++ ++ parent = dev->dev.parent; ++ if (!parent) ++ return empty; ++ ++ driver = parent->driver; ++ if (driver && driver->name) ++ return driver->name; ++ return empty; ++} ++ ++static void __netdev_printk(const char *level, const struct net_device *dev, ++ struct va_format *vaf) ++{ ++ if (dev && dev->dev.parent) { ++ dev_printk_emit(level[1] - '0', ++ dev->dev.parent, ++ "%s %s %s%s: %pV", ++ dev_driver_string(dev->dev.parent), ++ dev_name(dev->dev.parent), ++ netdev_name(dev), netdev_reg_state(dev), ++ vaf); ++ } else if (dev) { ++ printk("%s%s%s: %pV", ++ level, netdev_name(dev), netdev_reg_state(dev), vaf); ++ } else { ++ printk("%s(NULL net_device): %pV", level, vaf); ++ } ++} ++ ++void netdev_printk(const char *level, const struct net_device *dev, ++ const char *format, ...) ++{ ++ struct va_format vaf; ++ va_list args; ++ ++ va_start(args, format); ++ ++ vaf.fmt = format; ++ vaf.va = &args; ++ ++ __netdev_printk(level, dev, &vaf); ++ ++ va_end(args); ++} ++EXPORT_SYMBOL(netdev_printk); ++ ++#define define_netdev_printk_level(func, level) \ ++void func(const struct net_device *dev, const char *fmt, ...) \ ++{ \ ++ struct va_format vaf; \ ++ va_list args; \ ++ \ ++ va_start(args, fmt); \ ++ \ ++ vaf.fmt = fmt; \ ++ vaf.va = &args; \ ++ \ ++ __netdev_printk(level, dev, &vaf); \ ++ \ ++ va_end(args); \ ++} \ ++EXPORT_SYMBOL(func); ++ ++define_netdev_printk_level(netdev_emerg, KERN_EMERG); ++define_netdev_printk_level(netdev_alert, KERN_ALERT); ++define_netdev_printk_level(netdev_crit, KERN_CRIT); ++define_netdev_printk_level(netdev_err, KERN_ERR); ++define_netdev_printk_level(netdev_warn, KERN_WARNING); ++define_netdev_printk_level(netdev_notice, KERN_NOTICE); ++define_netdev_printk_level(netdev_info, KERN_INFO); ++ ++static void __net_exit netdev_exit(struct net *net) ++{ ++ kfree(net->dev_name_head); ++ kfree(net->dev_index_head); ++ if (net != &init_net) ++ WARN_ON_ONCE(!list_empty(&net->dev_base_head)); ++} ++ ++static struct pernet_operations __net_initdata netdev_net_ops = { ++ .init = netdev_init, ++ .exit = netdev_exit, ++}; ++ ++static void __net_exit default_device_exit_net(struct net *net) ++{ ++ struct net_device *dev, *aux; ++ /* ++ * Push all migratable network devices back to the ++ * initial network namespace ++ */ ++ ASSERT_RTNL(); ++ for_each_netdev_safe(net, dev, aux) { ++ int err; ++ char fb_name[IFNAMSIZ]; ++ ++ /* Ignore unmoveable devices (i.e. loopback) */ ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ continue; ++ ++ /* Leave virtual devices for the generic cleanup */ ++ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) ++ continue; ++ ++ /* Push remaining network devices to init_net */ ++ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); ++ if (netdev_name_in_use(&init_net, fb_name)) ++ snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ err = dev_change_net_namespace(dev, &init_net, fb_name); ++ if (err) { ++ pr_emerg("%s: failed to move %s to init_net: %d\n", ++ __func__, dev->name, err); ++ BUG(); ++ } ++ } ++} ++ ++static void __net_exit default_device_exit_batch(struct list_head *net_list) ++{ ++ /* At exit all network devices most be removed from a network ++ * namespace. Do this in the reverse order of registration. ++ * Do this across as many network namespaces as possible to ++ * improve batching efficiency. 
++ */ ++ struct net_device *dev; ++ struct net *net; ++ LIST_HEAD(dev_kill_list); ++ ++ rtnl_lock(); ++ list_for_each_entry(net, net_list, exit_list) { ++ default_device_exit_net(net); ++ cond_resched(); ++ } ++ ++ list_for_each_entry(net, net_list, exit_list) { ++ for_each_netdev_reverse(net, dev) { ++ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) ++ dev->rtnl_link_ops->dellink(dev, &dev_kill_list); ++ else ++ unregister_netdevice_queue(dev, &dev_kill_list); ++ } ++ } ++ unregister_netdevice_many(&dev_kill_list); ++ rtnl_unlock(); ++} ++ ++static struct pernet_operations __net_initdata default_device_ops = { ++ .exit_batch = default_device_exit_batch, ++}; ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. ++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_kobject_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < PTYPE_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ if (register_pernet_subsys(&netdev_net_ops)) ++ goto out; ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for_each_possible_cpu(i) { ++ struct work_struct *flush = per_cpu_ptr(&flush_works, i); ++ struct softnet_data *sd = &per_cpu(softnet_data, i); ++ ++ INIT_WORK(flush, flush_backlog); ++ ++ skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init(&sd->process_queue); ++#ifdef CONFIG_XFRM_OFFLOAD ++ skb_queue_head_init(&sd->xfrm_backlog); ++#endif ++ INIT_LIST_HEAD(&sd->poll_list); ++ sd->output_queue_tailp = &sd->output_queue; ++#ifdef CONFIG_RPS ++ INIT_CSD(&sd->csd, rps_trigger_softirq, sd); ++ sd->cpu = i; ++#endif ++ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); ++ spin_lock_init(&sd->defer_lock); ++ ++ init_gro_hash(&sd->backlog); ++ sd->backlog.poll = process_backlog; ++ sd->backlog.weight = weight_p; ++ } ++ ++ dev_boot_phase = 0; ++ ++ /* The loopback device is special if any other network devices ++ * is present in a network namespace the loopback device must ++ * be present. Since we now dynamically allocate and free the ++ * loopback device ensure this invariant is maintained by ++ * keeping the loopback device as the first device on the ++ * list of network devices. Ensuring the loopback devices ++ * is the first device that appears and the last network device ++ * that disappears. 
++ */ ++ if (register_pernet_device(&loopback_net_ops)) ++ goto out; ++ ++ if (register_pernet_device(&default_device_ops)) ++ goto out; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action); ++ ++ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", ++ NULL, dev_cpu_dead); ++ WARN_ON(rc < 0); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); +diff -rupN linux.orig/net/core/devlink.c linux/net/core/devlink.c +--- linux.orig/net/core/devlink.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/devlink.c 2022-12-04 10:40:26.732034003 -0500 +@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(stru cpu_stats = per_cpu_ptr(trap_stats, i); do { @@ -8713,11 +50605,10 @@ index b50bcc18b8d9e..cfa6a099457ae 100644 u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); -diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c -index 75501e1bdd25b..dfcaf61d972c7 100644 ---- a/net/core/drop_monitor.c -+++ b/net/core/drop_monitor.c -@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) +diff -rupN linux.orig/net/core/drop_monitor.c linux/net/core/drop_monitor.c +--- linux.orig/net/core/drop_monitor.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/drop_monitor.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net u64 dropped; do { @@ -8729,7 +50620,7 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) +@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct u64 dropped; do { @@ -8741,11 +50632,10 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c -index c8d137ef5980e..b71ccaec09914 100644 ---- a/net/core/gen_stats.c -+++ b/net/core/gen_stats.c -@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, +diff -rupN linux.orig/net/core/gen_stats.c linux/net/core/gen_stats.c +--- linux.orig/net/core/gen_stats.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/gen_stats.c 2022-12-04 10:40:26.732034003 -0500 +@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(str u64 bytes, packets; do { @@ -8758,7 +50648,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, +@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_st } do { if (running) @@ -8771,7 +50661,7 @@ index c8d137ef5980e..b71ccaec09914 100644 _bstats_update(bstats, bytes, packets); } -@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *r u64 bytes, packets; do { @@ -8784,7 +50674,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *r } do { if (running) @@ -8797,11 +50687,10 @@ index c8d137ef5980e..b71ccaec09914 100644 } static int -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 417463da4fac7..505c72a9b1534 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -6555,6 +6555,11 @@ nodefer: __kfree_skb(skb); +diff -rupN linux.orig/net/core/skbuff.c 
linux/net/core/skbuff.c +--- linux.orig/net/core/skbuff.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/skbuff.c 2022-12-04 10:40:26.732034003 -0500 +@@ -6557,6 +6557,11 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ @@ -8814,11 +50703,6576 @@ index 417463da4fac7..505c72a9b1534 100644 +#endif + } } -diff --git a/net/dsa/slave.c b/net/dsa/slave.c -index 1291c2431d440..dcc550b871623 100644 ---- a/net/dsa/slave.c -+++ b/net/dsa/slave.c -@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/net/core/skbuff.c.orig linux/net/core/skbuff.c.orig +--- linux.orig/net/core/skbuff.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/skbuff.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,6562 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Routines having to do with the 'struct sk_buff' memory handlers. ++ * ++ * Authors: Alan Cox ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Fixed the worst of the load ++ * balancer bugs. ++ * Dave Platt : Interrupt stacking fix. ++ * Richard Kooijman : Timestamp fixes. ++ * Alan Cox : Changed buffer format. ++ * Alan Cox : destructor hook for AF_UNIX etc. ++ * Linus Torvalds : Better skb_clone. ++ * Alan Cox : Added skb_copy. ++ * Alan Cox : Added all the changed routines Linus ++ * only put in the headers ++ * Ray VanTassle : Fixed --skb->lock in free ++ * Alan Cox : skb_copy copy arp field ++ * Andi Kleen : slabified it. ++ * Robert Olsson : Removed skb_head_pool ++ * ++ * NOTE: ++ * The __skb_ routines should be called with interrupts ++ * disabled, or you better be *real* sure that the operation is atomic ++ * with respect to whatever list is being frobbed (e.g. via lock_sock() ++ * or via disabling bottom half handlers, etc). ++ */ ++ ++/* ++ * The functions in this file will not compile correctly with gcc 2.4.x ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_CLS_ACT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "sock_destructor.h" ++ ++struct kmem_cache *skbuff_head_cache __ro_after_init; ++static struct kmem_cache *skbuff_fclone_cache __ro_after_init; ++#ifdef CONFIG_SKB_EXTENSIONS ++static struct kmem_cache *skbuff_ext_cache __ro_after_init; ++#endif ++int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; ++EXPORT_SYMBOL(sysctl_max_skb_frags); ++ ++#undef FN ++#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, ++const char * const drop_reasons[] = { ++ DEFINE_DROP_REASON(FN, FN) ++}; ++EXPORT_SYMBOL(drop_reasons); ++ ++/** ++ * skb_panic - private function for out-of-line support ++ * @skb: buffer ++ * @sz: size ++ * @addr: address ++ * @msg: skb_over_panic or skb_under_panic ++ * ++ * Out-of-line support for skb_put() and skb_push(). ++ * Called via the wrapper skb_over_panic() or skb_under_panic(). ++ * Keep out of line to prevent kernel bloat. ++ * __builtin_return_address is not used because it is not always reliable. 
++ */ ++static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, ++ const char msg[]) ++{ ++ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", ++ msg, addr, skb->len, sz, skb->head, skb->data, ++ (unsigned long)skb->tail, (unsigned long)skb->end, ++ skb->dev ? skb->dev->name : ""); ++ BUG(); ++} ++ ++static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++#define NAPI_SKB_CACHE_SIZE 64 ++#define NAPI_SKB_CACHE_BULK 16 ++#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) ++ ++struct napi_alloc_cache { ++ struct page_frag_cache page; ++ unsigned int skb_count; ++ void *skb_cache[NAPI_SKB_CACHE_SIZE]; ++}; ++ ++static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); ++static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); ++ ++void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ ++ return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++} ++EXPORT_SYMBOL(__napi_alloc_frag_align); ++ ++void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ void *data; ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ if (in_hardirq() || irqs_disabled()) { ++ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); ++ ++ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); ++ } else { ++ struct napi_alloc_cache *nc; ++ ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++ local_bh_enable(); ++ } ++ return data; ++} ++EXPORT_SYMBOL(__netdev_alloc_frag_align); ++ ++static struct sk_buff *napi_skb_cache_get(void) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ struct sk_buff *skb; ++ ++ if (unlikely(!nc->skb_count)) { ++ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, ++ GFP_ATOMIC, ++ NAPI_SKB_CACHE_BULK, ++ nc->skb_cache); ++ if (unlikely(!nc->skb_count)) ++ return NULL; ++ } ++ ++ skb = nc->skb_cache[--nc->skb_count]; ++ kasan_unpoison_object_data(skbuff_head_cache, skb); ++ ++ return skb; ++} ++ ++/* Caller must provide SKB that is memset cleared */ ++static void __build_skb_around(struct sk_buff *skb, void *data, ++ unsigned int frag_size) ++{ ++ struct skb_shared_info *shinfo; ++ unsigned int size = frag_size ? : ksize(data); ++ ++ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ /* Assumes caller memset cleared SKB */ ++ skb->truesize = SKB_TRUESIZE(size); ++ refcount_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb_set_end_offset(skb, size); ++ skb->mac_header = (typeof(skb->mac_header))~0U; ++ skb->transport_header = (typeof(skb->transport_header))~0U; ++ skb->alloc_cpu = raw_smp_processor_id(); ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ ++ skb_set_kcov_handle(skb, kcov_common_handle()); ++} ++ ++/** ++ * __build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Allocate a new &sk_buff. 
Caller provides space holding head and ++ * skb_shared_info. @data must have been allocated by kmalloc() only if ++ * @frag_size is 0, otherwise data should come from the page allocator ++ * or vmalloc() ++ * The return is the new skb buffer. ++ * On a failure the return is %NULL, and @data is not freed. ++ * Notes : ++ * Before IO, driver allocates only data buffer where NIC put incoming frame ++ * Driver should add room at head (NET_SKB_PAD) and ++ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) ++ * After IO, driver calls build_skb(), to allocate sk_buff and populate it ++ * before giving packet to stack. ++ * RX rings only contains data buffers, not full skbs. ++ */ ++struct sk_buff *__build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/* build_skb() is wrapper over __build_skb(), that specifically ++ * takes care of skb->head and skb->pfmemalloc ++ * This means that if @frag_size is not zero, then @data must be backed ++ * by a page fragment, not kmalloc() or vmalloc() ++ */ ++struct sk_buff *build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __build_skb(data, frag_size); ++ ++ if (skb && frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb); ++ ++/** ++ * build_skb_around - build a network buffer around provided skb ++ * @skb: sk_buff provide by caller, must be memset cleared ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ */ ++struct sk_buff *build_skb_around(struct sk_buff *skb, ++ void *data, unsigned int frag_size) ++{ ++ if (unlikely(!skb)) ++ return NULL; ++ ++ __build_skb_around(skb, data, frag_size); ++ ++ if (frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb_around); ++ ++/** ++ * __napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __build_skb() that uses NAPI percpu caches to obtain ++ * skbuff_head instead of inplace allocation. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. ++ */ ++static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = napi_skb_cache_get(); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/** ++ * napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __napi_build_skb() that takes care of skb->head_frag ++ * and skb->pfmemalloc when the data is a page or page fragment. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. 
++ */ ++struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __napi_build_skb(data, frag_size); ++ ++ if (likely(skb) && frag_size) { ++ skb->head_frag = 1; ++ skb_propagate_pfmemalloc(virt_to_head_page(data), skb); ++ } ++ ++ return skb; ++} ++EXPORT_SYMBOL(napi_build_skb); ++ ++/* ++ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells ++ * the caller if emergency pfmemalloc reserves are being used. If it is and ++ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves ++ * may be used. Otherwise, the packet data may be discarded until enough ++ * memory is free ++ */ ++static void *kmalloc_reserve(size_t size, gfp_t flags, int node, ++ bool *pfmemalloc) ++{ ++ void *obj; ++ bool ret_pfmemalloc = false; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ obj = kmalloc_node_track_caller(size, ++ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, ++ node); ++ if (obj || !(gfp_pfmemalloc_allowed(flags))) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ obj = kmalloc_node_track_caller(size, flags, node); ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++ ++ return obj; ++} ++ ++/* Allocate a new skbuff. We do this ourselves so we can fill in a few ++ * 'private' fields and also do memory statistics to find all the ++ * [BEEP] leaks. ++ * ++ */ ++ ++/** ++ * __alloc_skb - allocate a network buffer ++ * @size: size to allocate ++ * @gfp_mask: allocation mask ++ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache ++ * instead of head cache and allocate a cloned (child) skb. ++ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for ++ * allocations in case the data is required for writeback ++ * @node: numa node to allocate memory on ++ * ++ * Allocate a new &sk_buff. The returned buffer has no headroom and a ++ * tail room of at least size bytes. The object has a reference count ++ * of one. The return is the buffer. On a failure the return is %NULL. ++ * ++ * Buffers may only be allocated from interrupts using a @gfp_mask of ++ * %GFP_ATOMIC. ++ */ ++struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, ++ int flags, int node) ++{ ++ struct kmem_cache *cache; ++ struct sk_buff *skb; ++ unsigned int osize; ++ bool pfmemalloc; ++ u8 *data; ++ ++ cache = (flags & SKB_ALLOC_FCLONE) ++ ? skbuff_fclone_cache : skbuff_head_cache; ++ ++ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ /* Get the HEAD */ ++ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && ++ likely(node == NUMA_NO_NODE || node == numa_mem_id())) ++ skb = napi_skb_cache_get(); ++ else ++ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); ++ if (unlikely(!skb)) ++ return NULL; ++ prefetchw(skb); ++ ++ /* We do our best to align skb_shared_info on a separate cache ++ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives ++ * aligned memory blocks, unless SLUB/SLAB debug is enabled. ++ * Both skb->head and skb_shared_info are cache line aligned. ++ */ ++ size = SKB_DATA_ALIGN(size); ++ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ++ if (unlikely(!data)) ++ goto nodata; ++ /* kmalloc(size) might give us more room than requested. ++ * Put skb_shared_info exactly at the end of allocated zone, ++ * to allow max possible filling before reallocation. 
++ */ ++ osize = ksize(data); ++ size = SKB_WITH_OVERHEAD(osize); ++ prefetchw(data + size); ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! ++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, osize); ++ skb->pfmemalloc = pfmemalloc; ++ ++ if (flags & SKB_ALLOC_FCLONE) { ++ struct sk_buff_fclones *fclones; ++ ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ skb->fclone = SKB_FCLONE_ORIG; ++ refcount_set(&fclones->fclone_ref, 1); ++ } ++ ++ return skb; ++ ++nodata: ++ kmem_cache_free(cache, skb); ++ return NULL; ++} ++EXPORT_SYMBOL(__alloc_skb); ++ ++/** ++ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device ++ * @dev: network device to receive on ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb ++ * ++ * Allocate a new &sk_buff and assign it a usage count of one. The ++ * buffer has NET_SKB_PAD headroom built in. Users should allocate ++ * the headroom they think they need without accounting for the ++ * built in space. The built in space is used for optimisations. ++ * ++ * %NULL is returned if there is no free memory. ++ */ ++struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct page_frag_cache *nc; ++ struct sk_buff *skb; ++ bool pfmemalloc; ++ void *data; ++ ++ len += NET_SKB_PAD; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ if (in_hardirq() || irqs_disabled()) { ++ nc = this_cpu_ptr(&netdev_alloc_cache); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ } else { ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache.page); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ local_bh_enable(); ++ } ++ ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD); ++ skb->dev = dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__netdev_alloc_skb); ++ ++/** ++ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance ++ * @napi: napi instance this buffer was allocated for ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages ++ * ++ * Allocate a new sk_buff for use in NAPI receive. This buffer will ++ * attempt to allocate the head from a special reserved region used ++ * only for NAPI Rx allocation. By doing this we can save several ++ * CPU cycles by avoiding having to disable and re-enable IRQs. ++ * ++ * %NULL is returned if there is no free memory. 
++ */ ++struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct napi_alloc_cache *nc; ++ struct sk_buff *skb; ++ void *data; ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ len += NET_SKB_PAD + NET_IP_ALIGN; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, ++ NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __napi_build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (nc->page.pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb->dev = napi->dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__napi_alloc_skb); ++ ++void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, ++ int size, unsigned int truesize) ++{ ++ skb_fill_page_desc(skb, i, page, off, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_add_rx_frag); ++ ++void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, ++ unsigned int truesize) ++{ ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ skb_frag_size_add(frag, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_coalesce_rx_frag); ++ ++static void skb_drop_list(struct sk_buff **listp) ++{ ++ kfree_skb_list(*listp); ++ *listp = NULL; ++} ++ ++static inline void skb_drop_fraglist(struct sk_buff *skb) ++{ ++ skb_drop_list(&skb_shinfo(skb)->frag_list); ++} ++ ++static void skb_clone_fraglist(struct sk_buff *skb) ++{ ++ struct sk_buff *list; ++ ++ skb_walk_frags(skb, list) ++ skb_get(list); ++} ++ ++static void skb_free_head(struct sk_buff *skb) ++{ ++ unsigned char *head = skb->head; ++ ++ if (skb->head_frag) { ++ if (skb_pp_recycle(skb, head)) ++ return; ++ skb_free_frag(head); ++ } else { ++ kfree(head); ++ } ++} ++ ++static void skb_release_data(struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int i; ++ ++ if (skb->cloned && ++ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, ++ &shinfo->dataref)) ++ goto exit; ++ ++ if (skb_zcopy(skb)) { ++ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; ++ ++ skb_zcopy_clear(skb, true); ++ if (skip_unref) ++ goto free_head; ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) ++ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); ++ ++free_head: ++ if (shinfo->frag_list) ++ kfree_skb_list(shinfo->frag_list); ++ ++ skb_free_head(skb); ++exit: ++ /* When we clone an SKB we copy the reycling bit. The pp_recycle ++ * bit is only set on the head though, so in order to avoid races ++ * while trying to recycle fragments on __skb_frag_unref() we need ++ * to make one SKB responsible for triggering the recycle path. ++ * So disable the recycling bit if an SKB is cloned and we have ++ * additional references to the fragmented part of the SKB. 
++ * Eventually the last SKB will have the recycling bit set and it's ++ * dataref set to 0, which will trigger the recycling ++ */ ++ skb->pp_recycle = 0; ++} ++ ++/* ++ * Free an skbuff by memory without cleaning the state. ++ */ ++static void kfree_skbmem(struct sk_buff *skb) ++{ ++ struct sk_buff_fclones *fclones; ++ ++ switch (skb->fclone) { ++ case SKB_FCLONE_UNAVAILABLE: ++ kmem_cache_free(skbuff_head_cache, skb); ++ return; ++ ++ case SKB_FCLONE_ORIG: ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ /* We usually free the clone (TX completion) before original skb ++ * This test would have no chance to be true for the clone, ++ * while here, branch prediction will be good. ++ */ ++ if (refcount_read(&fclones->fclone_ref) == 1) ++ goto fastpath; ++ break; ++ ++ default: /* SKB_FCLONE_CLONE */ ++ fclones = container_of(skb, struct sk_buff_fclones, skb2); ++ break; ++ } ++ if (!refcount_dec_and_test(&fclones->fclone_ref)) ++ return; ++fastpath: ++ kmem_cache_free(skbuff_fclone_cache, fclones); ++} ++ ++void skb_release_head_state(struct sk_buff *skb) ++{ ++ skb_dst_drop(skb); ++ if (skb->destructor) { ++ DEBUG_NET_WARN_ON_ONCE(in_hardirq()); ++ skb->destructor(skb); ++ } ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++ nf_conntrack_put(skb_nfct(skb)); ++#endif ++ skb_ext_put(skb); ++} ++ ++/* Free everything but the sk_buff shell. */ ++static void skb_release_all(struct sk_buff *skb) ++{ ++ skb_release_head_state(skb); ++ if (likely(skb->head)) ++ skb_release_data(skb); ++} ++ ++/** ++ * __kfree_skb - private function ++ * @skb: buffer ++ * ++ * Free an sk_buff. Release anything attached to the buffer. ++ * Clean the state. This is an internal helper function. Users should ++ * always call kfree_skb ++ */ ++ ++void __kfree_skb(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ kfree_skbmem(skb); ++} ++EXPORT_SYMBOL(__kfree_skb); ++ ++/** ++ * kfree_skb_reason - free an sk_buff with special reason ++ * @skb: buffer to free ++ * @reason: reason why this skb is dropped ++ * ++ * Drop a reference to the buffer and free it if the usage count has ++ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' ++ * tracepoint. ++ */ ++void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); ++ ++ trace_kfree_skb(skb, __builtin_return_address(0), reason); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(kfree_skb_reason); ++ ++void kfree_skb_list_reason(struct sk_buff *segs, ++ enum skb_drop_reason reason) ++{ ++ while (segs) { ++ struct sk_buff *next = segs->next; ++ ++ kfree_skb_reason(segs, reason); ++ segs = next; ++ } ++} ++EXPORT_SYMBOL(kfree_skb_list_reason); ++ ++/* Dump skb information and contents. ++ * ++ * Must only be called from net_ratelimit()-ed paths. ++ * ++ * Dumps whole packets if full_pkt, only headers otherwise. 
++ */ ++void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) ++{ ++ struct skb_shared_info *sh = skb_shinfo(skb); ++ struct net_device *dev = skb->dev; ++ struct sock *sk = skb->sk; ++ struct sk_buff *list_skb; ++ bool has_mac, has_trans; ++ int headroom, tailroom; ++ int i, len, seg_len; ++ ++ if (full_pkt) ++ len = skb->len; ++ else ++ len = min_t(int, skb->len, MAX_HEADER + 128); ++ ++ headroom = skb_headroom(skb); ++ tailroom = skb_tailroom(skb); ++ ++ has_mac = skb_mac_header_was_set(skb); ++ has_trans = skb_transport_header_was_set(skb); ++ ++ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" ++ "mac=(%d,%d) net=(%d,%d) trans=%d\n" ++ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" ++ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" ++ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", ++ level, skb->len, headroom, skb_headlen(skb), tailroom, ++ has_mac ? skb->mac_header : -1, ++ has_mac ? skb_mac_header_len(skb) : -1, ++ skb->network_header, ++ has_trans ? skb_network_header_len(skb) : -1, ++ has_trans ? skb->transport_header : -1, ++ sh->tx_flags, sh->nr_frags, ++ sh->gso_size, sh->gso_type, sh->gso_segs, ++ skb->csum, skb->ip_summed, skb->csum_complete_sw, ++ skb->csum_valid, skb->csum_level, ++ skb->hash, skb->sw_hash, skb->l4_hash, ++ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); ++ ++ if (dev) ++ printk("%sdev name=%s feat=%pNF\n", ++ level, dev->name, &dev->features); ++ if (sk) ++ printk("%ssk family=%hu type=%u proto=%u\n", ++ level, sk->sk_family, sk->sk_type, sk->sk_protocol); ++ ++ if (full_pkt && headroom) ++ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->head, headroom, false); ++ ++ seg_len = min_t(int, skb_headlen(skb), len); ++ if (seg_len) ++ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->data, seg_len, false); ++ len -= seg_len; ++ ++ if (full_pkt && tailroom) ++ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb_tail_pointer(skb), tailroom, false); ++ ++ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(frag, skb_frag_off(frag), ++ skb_frag_size(frag), p, p_off, p_len, ++ copied) { ++ seg_len = min_t(int, p_len, len); ++ vaddr = kmap_atomic(p); ++ print_hex_dump(level, "skb frag: ", ++ DUMP_PREFIX_OFFSET, ++ 16, 1, vaddr + p_off, seg_len, false); ++ kunmap_atomic(vaddr); ++ len -= seg_len; ++ if (!len) ++ break; ++ } ++ } ++ ++ if (full_pkt && skb_has_frag_list(skb)) { ++ printk("skb fraglist:\n"); ++ skb_walk_frags(skb, list_skb) ++ skb_dump(level, list_skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_dump); ++ ++/** ++ * skb_tx_error - report an sk_buff xmit error ++ * @skb: buffer that triggered an error ++ * ++ * Report xmit error if a device callback is tracking this skb. ++ * skb must be freed afterwards. 
++ */ ++void skb_tx_error(struct sk_buff *skb) ++{ ++ if (skb) { ++ skb_zcopy_downgrade_managed(skb); ++ skb_zcopy_clear(skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_tx_error); ++ ++#ifdef CONFIG_TRACEPOINTS ++/** ++ * consume_skb - free an skbuff ++ * @skb: buffer to free ++ * ++ * Drop a ref to the buffer and free it if the usage count has hit zero ++ * Functions identically to kfree_skb, but kfree_skb assumes that the frame ++ * is being dropped after a failure and notes that ++ */ ++void consume_skb(struct sk_buff *skb) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ trace_consume_skb(skb); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(consume_skb); ++#endif ++ ++/** ++ * __consume_stateless_skb - free an skbuff, assuming it is stateless ++ * @skb: buffer to free ++ * ++ * Alike consume_skb(), but this variant assumes that this is the last ++ * skb reference and all the head states have been already dropped ++ */ ++void __consume_stateless_skb(struct sk_buff *skb) ++{ ++ trace_consume_skb(skb); ++ skb_release_data(skb); ++ kfree_skbmem(skb); ++} ++ ++static void napi_skb_cache_put(struct sk_buff *skb) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ u32 i; ++ ++ kasan_poison_object_data(skbuff_head_cache, skb); ++ nc->skb_cache[nc->skb_count++] = skb; ++ ++ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { ++ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) ++ kasan_unpoison_object_data(skbuff_head_cache, ++ nc->skb_cache[i]); ++ ++ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, ++ nc->skb_cache + NAPI_SKB_CACHE_HALF); ++ nc->skb_count = NAPI_SKB_CACHE_HALF; ++ } ++} ++ ++void __kfree_skb_defer(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++ ++void napi_skb_free_stolen_head(struct sk_buff *skb) ++{ ++ if (unlikely(skb->slow_gro)) { ++ nf_reset_ct(skb); ++ skb_dst_drop(skb); ++ skb_ext_put(skb); ++ skb_orphan(skb); ++ skb->slow_gro = 0; ++ } ++ napi_skb_cache_put(skb); ++} ++ ++void napi_consume_skb(struct sk_buff *skb, int budget) ++{ ++ /* Zero budget indicate non-NAPI context called us, like netpoll */ ++ if (unlikely(!budget)) { ++ dev_consume_skb_any(skb); ++ return; ++ } ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ ++ if (!skb_unref(skb)) ++ return; ++ ++ /* if reaching here SKB is ready to free */ ++ trace_consume_skb(skb); ++ ++ /* if SKB is a clone, don't handle this case */ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++EXPORT_SYMBOL(napi_consume_skb); ++ ++/* Make sure a field is contained by headers group */ ++#define CHECK_SKB_FIELD(field) \ ++ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ ++ offsetof(struct sk_buff, headers.field)); \ ++ ++static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ new->tstamp = old->tstamp; ++ /* We do not copy old->sk */ ++ new->dev = old->dev; ++ memcpy(new->cb, old->cb, sizeof(old->cb)); ++ skb_dst_copy(new, old); ++ __skb_ext_copy(new, old); ++ __nf_copy(new, old, false); ++ ++ /* Note : this field could be in the headers group. 
++ * It is not yet because we do not want to have a 16 bit hole ++ */ ++ new->queue_mapping = old->queue_mapping; ++ ++ memcpy(&new->headers, &old->headers, sizeof(new->headers)); ++ CHECK_SKB_FIELD(protocol); ++ CHECK_SKB_FIELD(csum); ++ CHECK_SKB_FIELD(hash); ++ CHECK_SKB_FIELD(priority); ++ CHECK_SKB_FIELD(skb_iif); ++ CHECK_SKB_FIELD(vlan_proto); ++ CHECK_SKB_FIELD(vlan_tci); ++ CHECK_SKB_FIELD(transport_header); ++ CHECK_SKB_FIELD(network_header); ++ CHECK_SKB_FIELD(mac_header); ++ CHECK_SKB_FIELD(inner_protocol); ++ CHECK_SKB_FIELD(inner_transport_header); ++ CHECK_SKB_FIELD(inner_network_header); ++ CHECK_SKB_FIELD(inner_mac_header); ++ CHECK_SKB_FIELD(mark); ++#ifdef CONFIG_NETWORK_SECMARK ++ CHECK_SKB_FIELD(secmark); ++#endif ++#ifdef CONFIG_NET_RX_BUSY_POLL ++ CHECK_SKB_FIELD(napi_id); ++#endif ++ CHECK_SKB_FIELD(alloc_cpu); ++#ifdef CONFIG_XPS ++ CHECK_SKB_FIELD(sender_cpu); ++#endif ++#ifdef CONFIG_NET_SCHED ++ CHECK_SKB_FIELD(tc_index); ++#endif ++ ++} ++ ++/* ++ * You should not add any new code to this function. Add it to ++ * __copy_skb_header above instead. ++ */ ++static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) ++{ ++#define C(x) n->x = skb->x ++ ++ n->next = n->prev = NULL; ++ n->sk = NULL; ++ __copy_skb_header(n, skb); ++ ++ C(len); ++ C(data_len); ++ C(mac_len); ++ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; ++ n->cloned = 1; ++ n->nohdr = 0; ++ n->peeked = 0; ++ C(pfmemalloc); ++ C(pp_recycle); ++ n->destructor = NULL; ++ C(tail); ++ C(end); ++ C(head); ++ C(head_frag); ++ C(data); ++ C(truesize); ++ refcount_set(&n->users, 1); ++ ++ atomic_inc(&(skb_shinfo(skb)->dataref)); ++ skb->cloned = 1; ++ ++ return n; ++#undef C ++} ++ ++/** ++ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg ++ * @first: first sk_buff of the msg ++ */ ++struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) ++{ ++ struct sk_buff *n; ++ ++ n = alloc_skb(0, GFP_ATOMIC); ++ if (!n) ++ return NULL; ++ ++ n->len = first->len; ++ n->data_len = first->len; ++ n->truesize = first->truesize; ++ ++ skb_shinfo(n)->frag_list = first; ++ ++ __copy_skb_header(n, first); ++ n->destructor = NULL; ++ ++ return n; ++} ++EXPORT_SYMBOL_GPL(alloc_skb_for_msg); ++ ++/** ++ * skb_morph - morph one skb into another ++ * @dst: the skb to receive the contents ++ * @src: the skb to supply the contents ++ * ++ * This is identical to skb_clone except that the target skb is ++ * supplied by the user. ++ * ++ * The target skb is returned upon exit. ++ */ ++struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) ++{ ++ skb_release_all(dst); ++ return __skb_clone(dst, src); ++} ++EXPORT_SYMBOL_GPL(skb_morph); ++ ++int mm_account_pinned_pages(struct mmpin *mmp, size_t size) ++{ ++ unsigned long max_pg, num_pg, new_pg, old_pg; ++ struct user_struct *user; ++ ++ if (capable(CAP_IPC_LOCK) || !size) ++ return 0; ++ ++ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ ++ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ++ user = mmp->user ? 
: current_user(); ++ ++ do { ++ old_pg = atomic_long_read(&user->locked_vm); ++ new_pg = old_pg + num_pg; ++ if (new_pg > max_pg) ++ return -ENOBUFS; ++ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != ++ old_pg); ++ ++ if (!mmp->user) { ++ mmp->user = get_uid(user); ++ mmp->num_pg = num_pg; ++ } else { ++ mmp->num_pg += num_pg; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(mm_account_pinned_pages); ++ ++void mm_unaccount_pinned_pages(struct mmpin *mmp) ++{ ++ if (mmp->user) { ++ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); ++ free_uid(mmp->user); ++ } ++} ++EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); ++ ++static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) ++{ ++ struct ubuf_info *uarg; ++ struct sk_buff *skb; ++ ++ WARN_ON_ONCE(!in_task()); ++ ++ skb = sock_omalloc(sk, 0, GFP_KERNEL); ++ if (!skb) ++ return NULL; ++ ++ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); ++ uarg = (void *)skb->cb; ++ uarg->mmp.user = NULL; ++ ++ if (mm_account_pinned_pages(&uarg->mmp, size)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ uarg->callback = msg_zerocopy_callback; ++ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; ++ uarg->len = 1; ++ uarg->bytelen = size; ++ uarg->zerocopy = 1; ++ uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; ++ refcount_set(&uarg->refcnt, 1); ++ sock_hold(sk); ++ ++ return uarg; ++} ++ ++static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) ++{ ++ return container_of((void *)uarg, struct sk_buff, cb); ++} ++ ++struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, ++ struct ubuf_info *uarg) ++{ ++ if (uarg) { ++ const u32 byte_limit = 1 << 19; /* limit to a few TSO */ ++ u32 bytelen, next; ++ ++ /* there might be non MSG_ZEROCOPY users */ ++ if (uarg->callback != msg_zerocopy_callback) ++ return NULL; ++ ++ /* realloc only when socket is locked (TCP, UDP cork), ++ * so uarg->len and sk_zckey access is serialized ++ */ ++ if (!sock_owned_by_user(sk)) { ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ bytelen = uarg->bytelen + size; ++ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { ++ /* TCP can create new skb to attach new uarg */ ++ if (sk->sk_type == SOCK_STREAM) ++ goto new_alloc; ++ return NULL; ++ } ++ ++ next = (u32)atomic_read(&sk->sk_zckey); ++ if ((u32)(uarg->id + uarg->len) == next) { ++ if (mm_account_pinned_pages(&uarg->mmp, size)) ++ return NULL; ++ uarg->len++; ++ uarg->bytelen = bytelen; ++ atomic_set(&sk->sk_zckey, ++next); ++ ++ /* no extra ref when appending to datagram (MSG_MORE) */ ++ if (sk->sk_type == SOCK_STREAM) ++ net_zcopy_get(uarg); ++ ++ return uarg; ++ } ++ } ++ ++new_alloc: ++ return msg_zerocopy_alloc(sk, size); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); ++ ++static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) ++{ ++ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ u32 old_lo, old_hi; ++ u64 sum_len; ++ ++ old_lo = serr->ee.ee_info; ++ old_hi = serr->ee.ee_data; ++ sum_len = old_hi - old_lo + 1ULL + len; ++ ++ if (sum_len >= (1ULL << 32)) ++ return false; ++ ++ if (lo != old_hi + 1) ++ return false; ++ ++ serr->ee.ee_data += len; ++ return true; ++} ++ ++static void __msg_zerocopy_callback(struct ubuf_info *uarg) ++{ ++ struct sk_buff *tail, *skb = skb_from_uarg(uarg); ++ struct sock_exterr_skb *serr; ++ struct sock *sk = skb->sk; ++ struct sk_buff_head *q; ++ unsigned long flags; ++ bool is_zerocopy; ++ u32 lo, hi; ++ u16 len; ++ ++ mm_unaccount_pinned_pages(&uarg->mmp); ++ ++ /* if !len, there was only 1 call, and 
it was aborted ++ * so do not queue a completion notification ++ */ ++ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) ++ goto release; ++ ++ len = uarg->len; ++ lo = uarg->id; ++ hi = uarg->id + len - 1; ++ is_zerocopy = uarg->zerocopy; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = 0; ++ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; ++ serr->ee.ee_data = hi; ++ serr->ee.ee_info = lo; ++ if (!is_zerocopy) ++ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; ++ ++ q = &sk->sk_error_queue; ++ spin_lock_irqsave(&q->lock, flags); ++ tail = skb_peek_tail(q); ++ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || ++ !skb_zerocopy_notify_extend(tail, lo, len)) { ++ __skb_queue_tail(q, skb); ++ skb = NULL; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ sk_error_report(sk); ++ ++release: ++ consume_skb(skb); ++ sock_put(sk); ++} ++ ++void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, ++ bool success) ++{ ++ uarg->zerocopy = uarg->zerocopy & success; ++ ++ if (refcount_dec_and_test(&uarg->refcnt)) ++ __msg_zerocopy_callback(uarg); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_callback); ++ ++void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) ++{ ++ struct sock *sk = skb_from_uarg(uarg)->sk; ++ ++ atomic_dec(&sk->sk_zckey); ++ uarg->len--; ++ ++ if (have_uref) ++ msg_zerocopy_callback(NULL, uarg, true); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); ++ ++int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ++ struct msghdr *msg, int len, ++ struct ubuf_info *uarg) ++{ ++ struct ubuf_info *orig_uarg = skb_zcopy(skb); ++ int err, orig_len = skb->len; ++ ++ /* An skb can only point to one uarg. This edge case happens when ++ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. ++ */ ++ if (orig_uarg && uarg != orig_uarg) ++ return -EEXIST; ++ ++ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); ++ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ++ struct sock *save_sk = skb->sk; ++ ++ /* Streams do not free skb on error. Reset to prev state. */ ++ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); ++ skb->sk = sk; ++ ___pskb_trim(skb, orig_len); ++ skb->sk = save_sk; ++ return err; ++ } ++ ++ skb_zcopy_set(skb, uarg, NULL); ++ return skb->len - orig_len; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); ++ ++void __skb_zcopy_downgrade_managed(struct sk_buff *skb) ++{ ++ int i; ++ ++ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++} ++EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); ++ ++static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, ++ gfp_t gfp_mask) ++{ ++ if (skb_zcopy(orig)) { ++ if (skb_zcopy(nskb)) { ++ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ ++ if (!gfp_mask) { ++ WARN_ON_ONCE(1); ++ return -ENOMEM; ++ } ++ if (skb_uarg(nskb) == skb_uarg(orig)) ++ return 0; ++ if (skb_copy_ubufs(nskb, GFP_ATOMIC)) ++ return -EIO; ++ } ++ skb_zcopy_set(nskb, skb_uarg(orig), NULL); ++ } ++ return 0; ++} ++ ++/** ++ * skb_copy_ubufs - copy userspace skb frags buffers to kernel ++ * @skb: the skb to modify ++ * @gfp_mask: allocation priority ++ * ++ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. ++ * It will copy all frags into kernel and drop the reference ++ * to userspace pages. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ * ++ * Returns 0 on success or a negative error code on failure ++ * to allocate kernel memory to copy to. ++ */ ++int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int num_frags = skb_shinfo(skb)->nr_frags; ++ struct page *page, *head = NULL; ++ int i, new_frags; ++ u32 d_off; ++ ++ if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) ++ return -EINVAL; ++ ++ if (!num_frags) ++ goto release; ++ ++ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ for (i = 0; i < new_frags; i++) { ++ page = alloc_page(gfp_mask); ++ if (!page) { ++ while (head) { ++ struct page *next = (struct page *)page_private(head); ++ put_page(head); ++ head = next; ++ } ++ return -ENOMEM; ++ } ++ set_page_private(page, (unsigned long)head); ++ head = page; ++ } ++ ++ page = head; ++ d_off = 0; ++ for (i = 0; i < num_frags; i++) { ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), ++ p, p_off, p_len, copied) { ++ u32 copy, done = 0; ++ vaddr = kmap_atomic(p); ++ ++ while (done < p_len) { ++ if (d_off == PAGE_SIZE) { ++ d_off = 0; ++ page = (struct page *)page_private(page); ++ } ++ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); ++ memcpy(page_address(page) + d_off, ++ vaddr + p_off + done, copy); ++ done += copy; ++ d_off += copy; ++ } ++ kunmap_atomic(vaddr); ++ } ++ } ++ ++ /* skb frags release userspace buffers */ ++ for (i = 0; i < num_frags; i++) ++ skb_frag_unref(skb, i); ++ ++ /* skb frags point to kernel buffers */ ++ for (i = 0; i < new_frags - 1; i++) { ++ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); ++ head = (struct page *)page_private(head); ++ } ++ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); ++ skb_shinfo(skb)->nr_frags = new_frags; ++ ++release: ++ skb_zcopy_clear(skb, false); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_copy_ubufs); ++ ++/** ++ * skb_clone - duplicate an sk_buff ++ * @skb: buffer to clone ++ * @gfp_mask: allocation priority ++ * ++ * Duplicate an &sk_buff. The new one is not owned by a socket. Both ++ * copies share the same packet data but not structure. The new ++ * buffer has a reference count of 1. If the allocation fails the ++ * function returns %NULL otherwise the new buffer is returned. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ */ ++ ++struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ struct sk_buff_fclones *fclones = container_of(skb, ++ struct sk_buff_fclones, ++ skb1); ++ struct sk_buff *n; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ return NULL; ++ ++ if (skb->fclone == SKB_FCLONE_ORIG && ++ refcount_read(&fclones->fclone_ref) == 1) { ++ n = &fclones->skb2; ++ refcount_set(&fclones->fclone_ref, 2); ++ n->fclone = SKB_FCLONE_CLONE; ++ } else { ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); ++ if (!n) ++ return NULL; ++ ++ n->fclone = SKB_FCLONE_UNAVAILABLE; ++ } ++ ++ return __skb_clone(n, skb); ++} ++EXPORT_SYMBOL(skb_clone); ++ ++void skb_headers_offset_update(struct sk_buff *skb, int off) ++{ ++ /* Only adjust this if it actually is csum_start rather than csum */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb->csum_start += off; ++ /* {transport,network,mac}_header and tail are relative to skb->head */ ++ skb->transport_header += off; ++ skb->network_header += off; ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += off; ++ skb->inner_transport_header += off; ++ skb->inner_network_header += off; ++ skb->inner_mac_header += off; ++} ++EXPORT_SYMBOL(skb_headers_offset_update); ++ ++void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ __copy_skb_header(new, old); ++ ++ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; ++ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; ++ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; ++} ++EXPORT_SYMBOL(skb_copy_header); ++ ++static inline int skb_alloc_rx_flag(const struct sk_buff *skb) ++{ ++ if (skb_pfmemalloc(skb)) ++ return SKB_ALLOC_RX; ++ return 0; ++} ++ ++/** ++ * skb_copy - create private copy of an sk_buff ++ * @skb: buffer to copy ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data. This is used when the ++ * caller wishes to modify the data and needs a private copy of the ++ * data to alter. Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * As by-product this function converts non-linear &sk_buff to linear ++ * one, so that &sk_buff becomes completely private and caller is allowed ++ * to modify all the data of returned buffer. This means that this ++ * function is not recommended for use in circumstances when only ++ * header is going to be modified. Use pskb_copy() instead. ++ */ ++ ++struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int headerlen = skb_headroom(skb); ++ unsigned int size = skb_end_offset(skb) + skb->data_len; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ return NULL; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headerlen); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); ++ ++ skb_copy_header(n, skb); ++ return n; ++} ++EXPORT_SYMBOL(skb_copy); ++ ++/** ++ * __pskb_copy_fclone - create copy of an sk_buff with private head. ++ * @skb: buffer to copy ++ * @headroom: headroom of new skb ++ * @gfp_mask: allocation priority ++ * @fclone: if true allocate the copy of the skb from the fclone ++ * cache instead of the head cache; it is recommended to set this ++ * to true for the cases where the copy will likely be cloned ++ * ++ * Make a copy of both an &sk_buff and part of its data, located ++ * in header. 
Fragmented data remain shared. This is used when ++ * the caller wishes to modify only header of &sk_buff and needs ++ * private copy of the header to alter. Returns %NULL on failure ++ * or the pointer to the buffer on success. ++ * The returned buffer has a reference count of 1. ++ */ ++ ++struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, ++ gfp_t gfp_mask, bool fclone) ++{ ++ unsigned int size = skb_headlen(skb) + headroom; ++ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); ++ ++ if (!n) ++ goto out; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headroom); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb_headlen(skb)); ++ /* Copy the bytes */ ++ skb_copy_from_linear_data(skb, n->data, n->len); ++ ++ n->truesize += skb->data_len; ++ n->data_len = skb->data_len; ++ n->len = skb->len; ++ ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ ++ if (skb_orphan_frags(skb, gfp_mask) || ++ skb_zerocopy_clone(n, skb, gfp_mask)) { ++ kfree_skb(n); ++ n = NULL; ++ goto out; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; ++ skb_frag_ref(skb, i); ++ } ++ skb_shinfo(n)->nr_frags = i; ++ } ++ ++ if (skb_has_frag_list(skb)) { ++ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; ++ skb_clone_fraglist(n); ++ } ++ ++ skb_copy_header(n, skb); ++out: ++ return n; ++} ++EXPORT_SYMBOL(__pskb_copy_fclone); ++ ++/** ++ * pskb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @nhead: room to add at head ++ * @ntail: room to add at tail ++ * @gfp_mask: allocation priority ++ * ++ * Expands (or creates identical copy, if @nhead and @ntail are zero) ++ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have ++ * reference count of 1. Returns zero in the case of success or error, ++ * if expansion failed. In the last case, &sk_buff is not changed. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ++ gfp_t gfp_mask) ++{ ++ int i, osize = skb_end_offset(skb); ++ int size = osize + nhead + ntail; ++ long off; ++ u8 *data; ++ ++ BUG_ON(nhead < 0); ++ ++ BUG_ON(skb_shared(skb)); ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy only real data... and, alas, header. This should be ++ * optimized for the cases when header is void. 
++ */ ++ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); ++ ++ /* ++ * if shinfo is shared we must drop the old head gracefully, but if it ++ * is not we can just drop the old head and let the existing refcount ++ * be since all we did is relocate the values ++ */ ++ if (skb_cloned(skb)) { ++ if (skb_orphan_frags(skb, gfp_mask)) ++ goto nofrags; ++ if (skb_zcopy(skb)) ++ refcount_inc(&skb_uarg(skb)->refcnt); ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ skb_release_data(skb); ++ } else { ++ skb_free_head(skb); ++ } ++ off = (data + nhead) - skb->head; ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data += off; ++ ++ skb_set_end_offset(skb, size); ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ off = nhead; ++#endif ++ skb->tail += off; ++ skb_headers_offset_update(skb, nhead); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ skb_metadata_clear(skb); ++ ++ /* It is not generally safe to change skb->truesize. ++ * For the moment, we really care of rx path, or ++ * when skb is orphaned (not attached to a socket). ++ */ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb->truesize += size - osize; ++ ++ return 0; ++ ++nofrags: ++ kfree(data); ++nodata: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(pskb_expand_head); ++ ++/* Make private copy of skb with writable head and some headroom */ ++ ++struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) ++{ ++ struct sk_buff *skb2; ++ int delta = headroom - skb_headroom(skb); ++ ++ if (delta <= 0) ++ skb2 = pskb_copy(skb, GFP_ATOMIC); ++ else { ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ++ GFP_ATOMIC)) { ++ kfree_skb(skb2); ++ skb2 = NULL; ++ } ++ } ++ return skb2; ++} ++EXPORT_SYMBOL(skb_realloc_headroom); ++ ++int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) ++{ ++ unsigned int saved_end_offset, saved_truesize; ++ struct skb_shared_info *shinfo; ++ int res; ++ ++ saved_end_offset = skb_end_offset(skb); ++ saved_truesize = skb->truesize; ++ ++ res = pskb_expand_head(skb, 0, 0, pri); ++ if (res) ++ return res; ++ ++ skb->truesize = saved_truesize; ++ ++ if (likely(skb_end_offset(skb) == saved_end_offset)) ++ return 0; ++ ++ shinfo = skb_shinfo(skb); ++ ++ /* We are about to change back skb->end, ++ * we need to move skb_shinfo() to its new location. ++ */ ++ memmove(skb->head + saved_end_offset, ++ shinfo, ++ offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); ++ ++ skb_set_end_offset(skb, saved_end_offset); ++ ++ return 0; ++} ++ ++/** ++ * skb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @headroom: needed headroom ++ * ++ * Unlike skb_realloc_headroom, this one does not allocate a new skb ++ * if possible; copies skb->sk to new skb as needed ++ * and frees original skb in case of failures. ++ * ++ * It expect increased headroom and generates warning otherwise. 
++ */ ++ ++struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) ++{ ++ int delta = headroom - skb_headroom(skb); ++ int osize = skb_end_offset(skb); ++ struct sock *sk = skb->sk; ++ ++ if (WARN_ONCE(delta <= 0, ++ "%s is expecting an increase in the headroom", __func__)) ++ return skb; ++ ++ delta = SKB_DATA_ALIGN(delta); ++ /* pskb_expand_head() might crash, if skb is shared. */ ++ if (skb_shared(skb) || !is_skb_wmem(skb)) { ++ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); ++ ++ if (unlikely(!nskb)) ++ goto fail; ++ ++ if (sk) ++ skb_set_owner_w(nskb, sk); ++ consume_skb(skb); ++ skb = nskb; ++ } ++ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) ++ goto fail; ++ ++ if (sk && is_skb_wmem(skb)) { ++ delta = skb_end_offset(skb) - osize; ++ refcount_add(delta, &sk->sk_wmem_alloc); ++ skb->truesize += delta; ++ } ++ return skb; ++ ++fail: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_expand_head); ++ ++/** ++ * skb_copy_expand - copy and expand sk_buff ++ * @skb: buffer to copy ++ * @newheadroom: new free bytes at head ++ * @newtailroom: new free bytes at tail ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data and while doing so ++ * allocate additional space. ++ * ++ * This is used when the caller wishes to modify the data and needs a ++ * private copy of the data to alter as well as more space for new fields. ++ * Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * You must pass %GFP_ATOMIC as the allocation priority if this function ++ * is called from an interrupt. ++ */ ++struct sk_buff *skb_copy_expand(const struct sk_buff *skb, ++ int newheadroom, int newtailroom, ++ gfp_t gfp_mask) ++{ ++ /* ++ * Allocate the copy buffer ++ */ ++ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, ++ gfp_mask, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ int oldheadroom = skb_headroom(skb); ++ int head_copy_len, head_copy_off; ++ ++ if (!n) ++ return NULL; ++ ++ skb_reserve(n, newheadroom); ++ ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ head_copy_len = oldheadroom; ++ head_copy_off = 0; ++ if (newheadroom <= head_copy_len) ++ head_copy_len = newheadroom; ++ else ++ head_copy_off = newheadroom - head_copy_len; ++ ++ /* Copy the linear header and data. */ ++ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, ++ skb->len + head_copy_len)); ++ ++ skb_copy_header(n, skb); ++ ++ skb_headers_offset_update(n, newheadroom - oldheadroom); ++ ++ return n; ++} ++EXPORT_SYMBOL(skb_copy_expand); ++ ++/** ++ * __skb_pad - zero pad the tail of an skb ++ * @skb: buffer to pad ++ * @pad: space to pad ++ * @free_on_error: free buffer on error ++ * ++ * Ensure that a buffer is followed by a padding area that is zero ++ * filled. Used by network drivers which may DMA or transfer data ++ * beyond the buffer end onto the wire. ++ * ++ * May return error in out of memory cases. The skb is freed on error ++ * if @free_on_error is true. ++ */ ++ ++int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) ++{ ++ int err; ++ int ntail; ++ ++ /* If the skbuff is non linear tailroom is always zero.. 
*/ ++ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { ++ memset(skb->data+skb->len, 0, pad); ++ return 0; ++ } ++ ++ ntail = skb->data_len + pad - (skb->end - skb->tail); ++ if (likely(skb_cloned(skb) || ntail > 0)) { ++ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto free_skb; ++ } ++ ++ /* FIXME: The use of this function with non-linear skb's really needs ++ * to be audited. ++ */ ++ err = skb_linearize(skb); ++ if (unlikely(err)) ++ goto free_skb; ++ ++ memset(skb->data + skb->len, 0, pad); ++ return 0; ++ ++free_skb: ++ if (free_on_error) ++ kfree_skb(skb); ++ return err; ++} ++EXPORT_SYMBOL(__skb_pad); ++ ++/** ++ * pskb_put - add data to the tail of a potentially fragmented buffer ++ * @skb: start of the buffer to use ++ * @tail: tail fragment of the buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the potentially ++ * fragmented buffer. @tail must be the last fragment of @skb -- or ++ * @skb itself. If this would exceed the total buffer size the kernel ++ * will panic. A pointer to the first byte of the extra data is ++ * returned. ++ */ ++ ++void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) ++{ ++ if (tail != skb) { ++ skb->data_len += len; ++ skb->len += len; ++ } ++ return skb_put(tail, len); ++} ++EXPORT_SYMBOL_GPL(pskb_put); ++ ++/** ++ * skb_put - add data to a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer. If this would ++ * exceed the total buffer size the kernel will panic. A pointer to the ++ * first byte of the extra data is returned. ++ */ ++void *skb_put(struct sk_buff *skb, unsigned int len) ++{ ++ void *tmp = skb_tail_pointer(skb); ++ SKB_LINEAR_ASSERT(skb); ++ skb->tail += len; ++ skb->len += len; ++ if (unlikely(skb->tail > skb->end)) ++ skb_over_panic(skb, len, __builtin_return_address(0)); ++ return tmp; ++} ++EXPORT_SYMBOL(skb_put); ++ ++/** ++ * skb_push - add data to the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer at the buffer ++ * start. If this would exceed the total buffer headroom the kernel will ++ * panic. A pointer to the first byte of the extra data is returned. ++ */ ++void *skb_push(struct sk_buff *skb, unsigned int len) ++{ ++ skb->data -= len; ++ skb->len += len; ++ if (unlikely(skb->data < skb->head)) ++ skb_under_panic(skb, len, __builtin_return_address(0)); ++ return skb->data; ++} ++EXPORT_SYMBOL(skb_push); ++ ++/** ++ * skb_pull - remove data from the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the next data in the buffer ++ * is returned. Once the data has been pulled future pushes will overwrite ++ * the old data. ++ */ ++void *skb_pull(struct sk_buff *skb, unsigned int len) ++{ ++ return skb_pull_inline(skb, len); ++} ++EXPORT_SYMBOL(skb_pull); ++ ++/** ++ * skb_pull_data - remove data from the start of a buffer returning its ++ * original position. ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the original data in the buffer ++ * is returned after checking if there is enough data to pull. Once the ++ * data has been pulled future pushes will overwrite the old data. 
++ */ ++void *skb_pull_data(struct sk_buff *skb, size_t len) ++{ ++ void *data = skb->data; ++ ++ if (skb->len < len) ++ return NULL; ++ ++ skb_pull(skb, len); ++ ++ return data; ++} ++EXPORT_SYMBOL(skb_pull_data); ++ ++/** ++ * skb_trim - remove end from a buffer ++ * @skb: buffer to alter ++ * @len: new length ++ * ++ * Cut the length of a buffer down by removing data from the tail. If ++ * the buffer is already under the length specified it is not modified. ++ * The skb must be linear. ++ */ ++void skb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->len > len) ++ __skb_trim(skb, len); ++} ++EXPORT_SYMBOL(skb_trim); ++ ++/* Trims skb to length len. It can change skb pointers. ++ */ ++ ++int ___pskb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ struct sk_buff **fragp; ++ struct sk_buff *frag; ++ int offset = skb_headlen(skb); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int err; ++ ++ if (skb_cloned(skb) && ++ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) ++ return err; ++ ++ i = 0; ++ if (offset >= len) ++ goto drop_pages; ++ ++ for (; i < nfrags; i++) { ++ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); ++ ++drop_pages: ++ skb_shinfo(skb)->nr_frags = i; ++ ++ for (; i < nfrags; i++) ++ skb_frag_unref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ goto done; ++ } ++ ++ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); ++ fragp = &frag->next) { ++ int end = offset + frag->len; ++ ++ if (skb_shared(frag)) { ++ struct sk_buff *nfrag; ++ ++ nfrag = skb_clone(frag, GFP_ATOMIC); ++ if (unlikely(!nfrag)) ++ return -ENOMEM; ++ ++ nfrag->next = frag->next; ++ consume_skb(frag); ++ frag = nfrag; ++ *fragp = frag; ++ } ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ if (end > len && ++ unlikely((err = pskb_trim(frag, len - offset)))) ++ return err; ++ ++ if (frag->next) ++ skb_drop_list(&frag->next); ++ break; ++ } ++ ++done: ++ if (len > skb_headlen(skb)) { ++ skb->data_len -= skb->len - len; ++ skb->len = len; ++ } else { ++ skb->len = len; ++ skb->data_len = 0; ++ skb_set_tail_pointer(skb, len); ++ } ++ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb_condense(skb); ++ return 0; ++} ++EXPORT_SYMBOL(___pskb_trim); ++ ++/* Note : use pskb_trim_rcsum() instead of calling this directly ++ */ ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ int delta = skb->len - len; ++ ++ skb->csum = csum_block_sub(skb->csum, ++ skb_checksum(skb, len, delta, 0), ++ len); ++ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; ++ int offset = skb_checksum_start_offset(skb) + skb->csum_offset; ++ ++ if (offset + sizeof(__sum16) > hdlen) ++ return -EINVAL; ++ } ++ return __pskb_trim(skb, len); ++} ++EXPORT_SYMBOL(pskb_trim_rcsum_slow); ++ ++/** ++ * __pskb_pull_tail - advance tail of skb header ++ * @skb: buffer to reallocate ++ * @delta: number of bytes to advance tail ++ * ++ * The function makes a sense only on a fragmented &sk_buff, ++ * it expands header moving its tail forward and copying necessary ++ * data from fragmented part. ++ * ++ * &sk_buff MUST have reference count of 1. ++ * ++ * Returns %NULL (and &sk_buff does not change) if pull failed ++ * or value of new tail of skb in the case of success. 
++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++/* Moves tail of skb head forward, copying data from fragmented part, ++ * when it is necessary. ++ * 1. It may fail due to malloc failure. ++ * 2. It may change skb pointers. ++ * ++ * It is pretty complicated. Luckily, it is called only in exceptional cases. ++ */ ++void *__pskb_pull_tail(struct sk_buff *skb, int delta) ++{ ++ /* If skb has not enough free space at tail, get new one ++ * plus 128 bytes for future expansions. If we have enough ++ * room at tail, reallocate without expansion only if skb is cloned. ++ */ ++ int i, k, eat = (skb->tail + delta) - skb->end; ++ ++ if (eat > 0 || skb_cloned(skb)) { ++ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, ++ GFP_ATOMIC)) ++ return NULL; ++ } ++ ++ BUG_ON(skb_copy_bits(skb, skb_headlen(skb), ++ skb_tail_pointer(skb), delta)); ++ ++ /* Optimization: no fragments, no reasons to preestimate ++ * size of pulled pages. Superb. ++ */ ++ if (!skb_has_frag_list(skb)) ++ goto pull_pages; ++ ++ /* Estimate size of pulled pages. */ ++ eat = delta; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size >= eat) ++ goto pull_pages; ++ eat -= size; ++ } ++ ++ /* If we need update frag list, we are in troubles. ++ * Certainly, it is possible to add an offset to skb data, ++ * but taking into account that pulling is expected to ++ * be very rare operation, it is worth to fight against ++ * further bloating skb head and crucify ourselves here instead. ++ * Pure masohism, indeed. 8)8) ++ */ ++ if (eat) { ++ struct sk_buff *list = skb_shinfo(skb)->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ ++ if (skb_shared(list)) { ++ /* Sucks! We need to fork list. :-( */ ++ clone = skb_clone(list, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without ++ * problems. */ ++ insp = list; ++ } ++ if (!pskb_pull(list, eat)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. */ ++ while ((list = skb_shinfo(skb)->frag_list) != insp) { ++ skb_shinfo(skb)->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ skb_shinfo(skb)->frag_list = clone; ++ } ++ } ++ /* Success! Now we may commit changes to skb data. 
*/ ++ ++pull_pages: ++ eat = delta; ++ k = 0; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size <= eat) { ++ skb_frag_unref(skb, i); ++ eat -= size; ++ } else { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; ++ ++ *frag = skb_shinfo(skb)->frags[i]; ++ if (eat) { ++ skb_frag_off_add(frag, eat); ++ skb_frag_size_sub(frag, eat); ++ if (!i) ++ goto end; ++ eat = 0; ++ } ++ k++; ++ } ++ } ++ skb_shinfo(skb)->nr_frags = k; ++ ++end: ++ skb->tail += delta; ++ skb->data_len -= delta; ++ ++ if (!skb->data_len) ++ skb_zcopy_clear(skb, false); ++ ++ return skb_tail_pointer(skb); ++} ++EXPORT_SYMBOL(__pskb_pull_tail); ++ ++/** ++ * skb_copy_bits - copy bits from skb to kernel buffer ++ * @skb: source skb ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source skb to the ++ * destination buffer. ++ * ++ * CAUTION ! : ++ * If its prototype is ever changed, ++ * check arch/{*}/net/{*}.S files, ++ * since it is called from BPF assembly code. ++ */ ++int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ /* Copy header. */ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_from_linear_data_offset(skb, offset, to, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(f); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(f, ++ skb_frag_off(f) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(to + copied, vaddr + p_off, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_copy_bits(frag_iter, offset - start, to, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_copy_bits); ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. 
++ */ ++static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ put_page(spd->pages[i]); ++} ++ ++static struct page *linear_to_page(struct page *page, unsigned int *len, ++ unsigned int *offset, ++ struct sock *sk) ++{ ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ return NULL; ++ ++ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); ++ ++ memcpy(page_address(pfrag->page) + pfrag->offset, ++ page_address(page) + *offset, *len); ++ *offset = pfrag->offset; ++ pfrag->offset += *len; ++ ++ return pfrag->page; ++} ++ ++static bool spd_can_coalesce(const struct splice_pipe_desc *spd, ++ struct page *page, ++ unsigned int offset) ++{ ++ return spd->nr_pages && ++ spd->pages[spd->nr_pages - 1] == page && ++ (spd->partial[spd->nr_pages - 1].offset + ++ spd->partial[spd->nr_pages - 1].len == offset); ++} ++ ++/* ++ * Fill page/offset/length into spd, if it can hold more pages. ++ */ ++static bool spd_fill_page(struct splice_pipe_desc *spd, ++ struct pipe_inode_info *pipe, struct page *page, ++ unsigned int *len, unsigned int offset, ++ bool linear, ++ struct sock *sk) ++{ ++ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) ++ return true; ++ ++ if (linear) { ++ page = linear_to_page(page, len, &offset, sk); ++ if (!page) ++ return true; ++ } ++ if (spd_can_coalesce(spd, page, offset)) { ++ spd->partial[spd->nr_pages - 1].len += *len; ++ return false; ++ } ++ get_page(page); ++ spd->pages[spd->nr_pages] = page; ++ spd->partial[spd->nr_pages].len = *len; ++ spd->partial[spd->nr_pages].offset = offset; ++ spd->nr_pages++; ++ ++ return false; ++} ++ ++static bool __splice_segment(struct page *page, unsigned int poff, ++ unsigned int plen, unsigned int *off, ++ unsigned int *len, ++ struct splice_pipe_desc *spd, bool linear, ++ struct sock *sk, ++ struct pipe_inode_info *pipe) ++{ ++ if (!*len) ++ return true; ++ ++ /* skip this segment if already processed */ ++ if (*off >= plen) { ++ *off -= plen; ++ return false; ++ } ++ ++ /* ignore any bits we already processed */ ++ poff += *off; ++ plen -= *off; ++ *off = 0; ++ ++ do { ++ unsigned int flen = min(*len, plen); ++ ++ if (spd_fill_page(spd, pipe, page, &flen, poff, ++ linear, sk)) ++ return true; ++ poff += flen; ++ plen -= flen; ++ *len -= flen; ++ } while (*len && plen); ++ ++ return false; ++} ++ ++/* ++ * Map linear and fragment data from the skb to spd. It reports true if the ++ * pipe is full or if we already spliced the requested length. ++ */ ++static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, ++ unsigned int *offset, unsigned int *len, ++ struct splice_pipe_desc *spd, struct sock *sk) ++{ ++ int seg; ++ struct sk_buff *iter; ++ ++ /* map the linear part : ++ * If skb->head_frag is set, this 'linear' part is backed by a ++ * fragment, and if the head is not shared with any clones then ++ * we can avoid a copy since we own the head portion of this page. 
++ */ ++ if (__splice_segment(virt_to_page(skb->data), ++ (unsigned long) skb->data & (PAGE_SIZE - 1), ++ skb_headlen(skb), ++ offset, len, spd, ++ skb_head_is_locked(skb), ++ sk, pipe)) ++ return true; ++ ++ /* ++ * then map the fragments ++ */ ++ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { ++ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; ++ ++ if (__splice_segment(skb_frag_page(f), ++ skb_frag_off(f), skb_frag_size(f), ++ offset, len, spd, false, sk, pipe)) ++ return true; ++ } ++ ++ skb_walk_frags(skb, iter) { ++ if (*offset >= iter->len) { ++ *offset -= iter->len; ++ continue; ++ } ++ /* __skb_splice_bits() only fails if the output has no room ++ * left, so no point in going over the frag_list for the error ++ * case. ++ */ ++ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Map data from the skb to a pipe. Should handle both the linear part, ++ * the fragments, and the frag list. ++ */ ++int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, ++ struct pipe_inode_info *pipe, unsigned int tlen, ++ unsigned int flags) ++{ ++ struct partial_page partial[MAX_SKB_FRAGS]; ++ struct page *pages[MAX_SKB_FRAGS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, ++ .ops = &nosteal_pipe_buf_ops, ++ .spd_release = sock_spd_release, ++ }; ++ int ret = 0; ++ ++ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); ++ ++ if (spd.nr_pages) ++ ret = splice_to_pipe(pipe, &spd); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(skb_splice_bits); ++ ++static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendmsg(sock, msg, vec, num, size); ++} ++ ++static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendpage(sock, page, offset, size, flags); ++} ++ ++typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size); ++typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags); ++static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, ++ int len, sendmsg_func sendmsg, sendpage_func sendpage) ++{ ++ unsigned int orig_len = len; ++ struct sk_buff *head = skb; ++ unsigned short fragidx; ++ int slen, ret; ++ ++do_frag_list: ++ ++ /* Deal with head data */ ++ while (offset < skb_headlen(skb) && len) { ++ struct kvec kv; ++ struct msghdr msg; ++ ++ slen = min_t(int, len, skb_headlen(skb) - offset); ++ kv.iov_base = skb->data + offset; ++ kv.iov_len = slen; ++ memset(&msg, 0, sizeof(msg)); ++ msg.msg_flags = MSG_DONTWAIT; ++ ++ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, ++ sendmsg_unlocked, sk, &msg, &kv, 1, slen); ++ if (ret <= 0) ++ goto error; ++ ++ offset += ret; ++ len -= ret; ++ } ++ ++ /* All the data was skb head? 
*/ ++ if (!len) ++ goto out; ++ ++ /* Make offset relative to start of frags */ ++ offset -= skb_headlen(skb); ++ ++ /* Find where we are in frag list */ ++ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ if (offset < skb_frag_size(frag)) ++ break; ++ ++ offset -= skb_frag_size(frag); ++ } ++ ++ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ slen = min_t(size_t, len, skb_frag_size(frag) - offset); ++ ++ while (slen) { ++ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, ++ sendpage_unlocked, sk, ++ skb_frag_page(frag), ++ skb_frag_off(frag) + offset, ++ slen, MSG_DONTWAIT); ++ if (ret <= 0) ++ goto error; ++ ++ len -= ret; ++ offset += ret; ++ slen -= ret; ++ } ++ ++ offset = 0; ++ } ++ ++ if (len) { ++ /* Process any frag lists */ ++ ++ if (skb == head) { ++ if (skb_has_frag_list(skb)) { ++ skb = skb_shinfo(skb)->frag_list; ++ goto do_frag_list; ++ } ++ } else if (skb->next) { ++ skb = skb->next; ++ goto do_frag_list; ++ } ++ } ++ ++out: ++ return orig_len - len; ++ ++error: ++ return orig_len == len ? ret : orig_len - len; ++} ++ ++/* Send skb data on a socket. Socket must be locked. */ ++int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, ++ int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, ++ kernel_sendpage_locked); ++} ++EXPORT_SYMBOL_GPL(skb_send_sock_locked); ++ ++/* Send skb data on a socket. Socket must be unlocked. */ ++int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, ++ sendpage_unlocked); ++} ++ ++/** ++ * skb_store_bits - store bits from kernel buffer to skb ++ * @skb: destination buffer ++ * @offset: offset in destination ++ * @from: source buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source buffer to the ++ * destination skb. This function handles all the messy bits of ++ * traversing fragment lists and such. 
++ */ ++ ++int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_to_linear_data_offset(skb, offset, from, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(vaddr + p_off, from + copied, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_store_bits(frag_iter, offset - start, ++ from, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_store_bits); ++ ++/* Checksum skb data. */ ++__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, ++ __wsum csum, const struct skb_checksum_ops *ops) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Checksum header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, ++ skb->data + offset, copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = INDIRECT_CALL_1(ops->update, ++ csum_partial_ext, ++ vaddr + p_off, p_len, 0); ++ kunmap_atomic(vaddr); ++ csum = INDIRECT_CALL_1(ops->combine, ++ csum_block_add_ext, csum, ++ csum2, pos, p_len); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ if (copy > len) ++ copy = len; ++ csum2 = __skb_checksum(frag_iter, offset - start, ++ copy, 0, ops); ++ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, ++ csum, csum2, pos, copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ ++ return csum; ++} ++EXPORT_SYMBOL(__skb_checksum); ++ ++__wsum skb_checksum(const struct sk_buff *skb, int offset, ++ int len, __wsum csum) ++{ ++ const struct skb_checksum_ops ops = { ++ .update = csum_partial_ext, ++ .combine = csum_block_add_ext, ++ }; ++ ++ return __skb_checksum(skb, offset, len, csum, &ops); ++} ++EXPORT_SYMBOL(skb_checksum); ++ ++/* Both of above in one bottle. */ ++ ++__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, ++ u8 *to, int len) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ __wsum csum = 0; ++ ++ /* Copy header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial_copy_nocheck(skb->data + offset, to, ++ copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = csum_partial_copy_nocheck(vaddr + p_off, ++ to + copied, ++ p_len); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ __wsum csum2; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ csum2 = skb_copy_and_csum_bits(frag_iter, ++ offset - start, ++ to, copy); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return csum; ++} ++EXPORT_SYMBOL(skb_copy_and_csum_bits); ++ ++__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) ++{ ++ __sum16 sum; ++ ++ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); ++ /* See comments in __skb_checksum_complete(). */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ if (!skb_shared(skb)) ++ skb->csum_valid = !sum; ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete_head); ++ ++/* This function assumes skb->csum already holds pseudo header's checksum, ++ * which has been changed from the hardware checksum, for example, by ++ * __skb_checksum_validate_complete(). And, the original skb->csum must ++ * have been validated unsuccessfully for CHECKSUM_COMPLETE case. ++ * ++ * It returns non-zero if the recomputed checksum is still invalid, otherwise ++ * zero. The new checksum is stored back into skb->csum unless the skb is ++ * shared. ++ */ ++__sum16 __skb_checksum_complete(struct sk_buff *skb) ++{ ++ __wsum csum; ++ __sum16 sum; ++ ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ ++ sum = csum_fold(csum_add(skb->csum, csum)); ++ /* This check is inverted, because we already knew the hardware ++ * checksum is invalid before calling this function. So, if the ++ * re-computed checksum is valid instead, then we have a mismatch ++ * between the original skb->csum and skb_checksum(). This means either ++ * the original hardware checksum is incorrect or we screw up skb->csum ++ * when moving skb->data around. 
++ */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ ++ if (!skb_shared(skb)) { ++ /* Save full packet checksum */ ++ skb->csum = csum; ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum_complete_sw = 1; ++ skb->csum_valid = !sum; ++ } ++ ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete); ++ ++static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, ++ int offset, int len) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static const struct skb_checksum_ops default_crc32c_ops = { ++ .update = warn_crc32c_csum_update, ++ .combine = warn_crc32c_csum_combine, ++}; ++ ++const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = ++ &default_crc32c_ops; ++EXPORT_SYMBOL(crc32c_csum_stub); ++ ++ /** ++ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() ++ * @from: source buffer ++ * ++ * Calculates the amount of linear headroom needed in the 'to' skb passed ++ * into skb_zerocopy(). ++ */ ++unsigned int ++skb_zerocopy_headlen(const struct sk_buff *from) ++{ ++ unsigned int hlen = 0; ++ ++ if (!from->head_frag || ++ skb_headlen(from) < L1_CACHE_BYTES || ++ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { ++ hlen = skb_headlen(from); ++ if (!hlen) ++ hlen = from->len; ++ } ++ ++ if (skb_has_frag_list(from)) ++ hlen = from->len; ++ ++ return hlen; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); ++ ++/** ++ * skb_zerocopy - Zero copy skb to skb ++ * @to: destination buffer ++ * @from: source buffer ++ * @len: number of bytes to copy from source buffer ++ * @hlen: size of linear headroom in destination buffer ++ * ++ * Copies up to `len` bytes from `from` to `to` by creating references ++ * to the frags in the source buffer. ++ * ++ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the ++ * headroom in the `to` buffer. 
++ * ++ * Return value: ++ * 0: everything is OK ++ * -ENOMEM: couldn't orphan frags of @from due to lack of memory ++ * -EFAULT: skb_copy_bits() found some problem with skb geometry ++ */ ++int ++skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) ++{ ++ int i, j = 0; ++ int plen = 0; /* length of skb->head fragment */ ++ int ret; ++ struct page *page; ++ unsigned int offset; ++ ++ BUG_ON(!from->head_frag && !hlen); ++ ++ /* dont bother with small payloads */ ++ if (len <= skb_tailroom(to)) ++ return skb_copy_bits(from, 0, skb_put(to, len), len); ++ ++ if (hlen) { ++ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); ++ if (unlikely(ret)) ++ return ret; ++ len -= hlen; ++ } else { ++ plen = min_t(int, skb_headlen(from), len); ++ if (plen) { ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ __skb_fill_page_desc(to, 0, page, offset, plen); ++ get_page(page); ++ j = 1; ++ len -= plen; ++ } ++ } ++ ++ skb_len_add(to, len + plen); ++ ++ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { ++ skb_tx_error(from); ++ return -ENOMEM; ++ } ++ skb_zerocopy_clone(to, from, GFP_ATOMIC); ++ ++ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { ++ int size; ++ ++ if (!len) ++ break; ++ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; ++ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), ++ len); ++ skb_frag_size_set(&skb_shinfo(to)->frags[j], size); ++ len -= size; ++ skb_frag_ref(to, j); ++ j++; ++ } ++ skb_shinfo(to)->nr_frags = j; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy); ++ ++void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) ++{ ++ __wsum csum; ++ long csstart; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ csstart = skb_checksum_start_offset(skb); ++ else ++ csstart = skb_headlen(skb); ++ ++ BUG_ON(csstart > skb_headlen(skb)); ++ ++ skb_copy_from_linear_data(skb, to, csstart); ++ ++ csum = 0; ++ if (csstart != skb->len) ++ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, ++ skb->len - csstart); ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ long csstuff = csstart + skb->csum_offset; ++ ++ *((__sum16 *)(to + csstuff)) = csum_fold(csum); ++ } ++} ++EXPORT_SYMBOL(skb_copy_and_csum_dev); ++ ++/** ++ * skb_dequeue - remove from the head of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the head of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The head item is ++ * returned or %NULL if the list is empty. ++ */ ++ ++struct sk_buff *skb_dequeue(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue); ++ ++/** ++ * skb_dequeue_tail - remove from the tail of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the tail of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The tail item is ++ * returned or %NULL if the list is empty. 
++ */ ++struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue_tail(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue_tail); ++ ++/** ++ * skb_queue_purge - empty a list ++ * @list: list to empty ++ * ++ * Delete all buffers on an &sk_buff list. Each buffer is removed from ++ * the list and one reference dropped. This function takes the list ++ * lock and is atomic with respect to other list locking functions. ++ */ ++void skb_queue_purge(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(list)) != NULL) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL(skb_queue_purge); ++ ++/** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * Return value: the sum of truesizes of all purged skbs. ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++unsigned int skb_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ sum += skb->truesize; ++ kfree_skb(skb); ++ } ++ return sum; ++} ++ ++/** ++ * skb_queue_head - queue a buffer at the list head ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the start of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_head(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_head); ++ ++/** ++ * skb_queue_tail - queue a buffer at the list tail ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the tail of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_tail(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_tail); ++ ++/** ++ * skb_unlink - remove a buffer from a list ++ * @skb: buffer to remove ++ * @list: list to use ++ * ++ * Remove a packet from a list. The list locks are taken and this ++ * function is atomic with respect to other list locked calls ++ * ++ * You must know what list the SKB is on. ++ */ ++void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_unlink(skb, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_unlink); ++ ++/** ++ * skb_append - append a buffer ++ * @old: buffer to insert after ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet after a given packet in a list. 
The list locks are taken ++ * and this function is atomic with respect to other list locked calls. ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_after(list, old, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_append); ++ ++static inline void skb_split_inside_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, const int pos) ++{ ++ int i; ++ ++ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), ++ pos - len); ++ /* And move data appendix as is. */ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; ++ ++ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->data_len = skb->data_len; ++ skb1->len += skb1->data_len; ++ skb->data_len = 0; ++ skb->len = len; ++ skb_set_tail_pointer(skb, len); ++} ++ ++static inline void skb_split_no_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, int pos) ++{ ++ int i, k = 0; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->len = skb1->data_len = skb->len - len; ++ skb->len = len; ++ skb->data_len = len - pos; ++ ++ for (i = 0; i < nfrags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + size > len) { ++ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < len) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_ref(skb, i); ++ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); ++ skb_shinfo(skb)->nr_frags++; ++ } ++ k++; ++ } else ++ skb_shinfo(skb)->nr_frags++; ++ pos += size; ++ } ++ skb_shinfo(skb1)->nr_frags = k; ++} ++ ++/** ++ * skb_split - Split fragmented skb to two parts at length len. ++ * @skb: the buffer to split ++ * @skb1: the buffer to receive the second part ++ * @len: new length for skb ++ */ ++void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) ++{ ++ int pos = skb_headlen(skb); ++ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; ++ skb_zerocopy_clone(skb1, skb, 0); ++ if (len < pos) /* Split line is inside header. */ ++ skb_split_inside_header(skb, skb1, len, pos); ++ else /* Second chunk has no header, nothing to copy. */ ++ skb_split_no_header(skb, skb1, len, pos); ++} ++EXPORT_SYMBOL(skb_split); ++ ++/* Shifting from/to a cloned skb is a no-go. ++ * ++ * Caller cannot keep skb_shinfo related pointers past calling here! 
++ */ ++static int skb_prepare_for_shift(struct sk_buff *skb) ++{ ++ return skb_unclone_keeptruesize(skb, GFP_ATOMIC); ++} ++ ++/** ++ * skb_shift - Shifts paged data partially from skb to another ++ * @tgt: buffer into which tail data gets added ++ * @skb: buffer from which the paged data comes from ++ * @shiftlen: shift up to this many bytes ++ * ++ * Attempts to shift up to shiftlen worth of bytes, which may be less than ++ * the length of the skb, from skb to tgt. Returns number bytes shifted. ++ * It's up to caller to free skb if everything was shifted. ++ * ++ * If @tgt runs out of frags, the whole operation is aborted. ++ * ++ * Skb cannot include anything else but paged data while tgt is allowed ++ * to have non-paged data as well. ++ * ++ * TODO: full sized shift could be optimized but that would need ++ * specialized skb free'er to handle frags without up-to-date nr_frags. ++ */ ++int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) ++{ ++ int from, to, merge, todo; ++ skb_frag_t *fragfrom, *fragto; ++ ++ BUG_ON(shiftlen > skb->len); ++ ++ if (skb_headlen(skb)) ++ return 0; ++ if (skb_zcopy(tgt) || skb_zcopy(skb)) ++ return 0; ++ ++ todo = shiftlen; ++ from = 0; ++ to = skb_shinfo(tgt)->nr_frags; ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ ++ /* Actual merge is delayed until the point when we know we can ++ * commit all, so that we don't have to undo partial changes ++ */ ++ if (!to || ++ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), ++ skb_frag_off(fragfrom))) { ++ merge = -1; ++ } else { ++ merge = to - 1; ++ ++ todo -= skb_frag_size(fragfrom); ++ if (todo < 0) { ++ if (skb_prepare_for_shift(skb) || ++ skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ /* All previous frag pointers might be stale! */ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, shiftlen); ++ skb_frag_size_sub(fragfrom, shiftlen); ++ skb_frag_off_add(fragfrom, shiftlen); ++ ++ goto onlymerged; ++ } ++ ++ from++; ++ } ++ ++ /* Skip full, not-fitting skb to avoid expensive operations */ ++ if ((shiftlen == skb->len) && ++ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) ++ return 0; ++ ++ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { ++ if (to == MAX_SKB_FRAGS) ++ return 0; ++ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[to]; ++ ++ if (todo >= skb_frag_size(fragfrom)) { ++ *fragto = *fragfrom; ++ todo -= skb_frag_size(fragfrom); ++ from++; ++ to++; ++ ++ } else { ++ __skb_frag_ref(fragfrom); ++ skb_frag_page_copy(fragto, fragfrom); ++ skb_frag_off_copy(fragto, fragfrom); ++ skb_frag_size_set(fragto, todo); ++ ++ skb_frag_off_add(fragfrom, todo); ++ skb_frag_size_sub(fragfrom, todo); ++ todo = 0; ++ ++ to++; ++ break; ++ } ++ } ++ ++ /* Ready to "commit" this state change to tgt */ ++ skb_shinfo(tgt)->nr_frags = to; ++ ++ if (merge >= 0) { ++ fragfrom = &skb_shinfo(skb)->frags[0]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, skb_frag_size(fragfrom)); ++ __skb_frag_unref(fragfrom, skb->pp_recycle); ++ } ++ ++ /* Reposition in the original skb */ ++ to = 0; ++ while (from < skb_shinfo(skb)->nr_frags) ++ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; ++ skb_shinfo(skb)->nr_frags = to; ++ ++ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); ++ ++onlymerged: ++ /* Most likely the tgt won't ever need its checksum anymore, skb on ++ * the 
other hand might need it if it needs to be resent ++ */ ++ tgt->ip_summed = CHECKSUM_PARTIAL; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ skb_len_add(skb, -shiftlen); ++ skb_len_add(tgt, shiftlen); ++ ++ return shiftlen; ++} ++ ++/** ++ * skb_prepare_seq_read - Prepare a sequential read of skb data ++ * @skb: the buffer to read ++ * @from: lower offset of data to be read ++ * @to: upper offset of data to be read ++ * @st: state variable ++ * ++ * Initializes the specified state variable. Must be called before ++ * invoking skb_seq_read() for the first time. ++ */ ++void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct skb_seq_state *st) ++{ ++ st->lower_offset = from; ++ st->upper_offset = to; ++ st->root_skb = st->cur_skb = skb; ++ st->frag_idx = st->stepped_offset = 0; ++ st->frag_data = NULL; ++ st->frag_off = 0; ++} ++EXPORT_SYMBOL(skb_prepare_seq_read); ++ ++/** ++ * skb_seq_read - Sequentially read skb data ++ * @consumed: number of bytes consumed by the caller so far ++ * @data: destination pointer for data to be returned ++ * @st: state variable ++ * ++ * Reads a block of skb data at @consumed relative to the ++ * lower offset specified to skb_prepare_seq_read(). Assigns ++ * the head of the data block to @data and returns the length ++ * of the block or 0 if the end of the skb data or the upper ++ * offset has been reached. ++ * ++ * The caller is not required to consume all of the data ++ * returned, i.e. @consumed is typically set to the number ++ * of bytes already consumed and the next call to ++ * skb_seq_read() will return the remaining part of the block. ++ * ++ * Note 1: The size of each block of data returned can be arbitrary, ++ * this limitation is the cost for zerocopy sequential ++ * reads of potentially non linear data. ++ * ++ * Note 2: Fragment lists within fragments are not implemented ++ * at the moment, state->root_skb could be replaced with ++ * a stack for this purpose. 
++ */ ++unsigned int skb_seq_read(unsigned int consumed, const u8 **data, ++ struct skb_seq_state *st) ++{ ++ unsigned int block_limit, abs_offset = consumed + st->lower_offset; ++ skb_frag_t *frag; ++ ++ if (unlikely(abs_offset >= st->upper_offset)) { ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ return 0; ++ } ++ ++next_skb: ++ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; ++ ++ if (abs_offset < block_limit && !st->frag_data) { ++ *data = st->cur_skb->data + (abs_offset - st->stepped_offset); ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_idx == 0 && !st->frag_data) ++ st->stepped_offset += skb_headlen(st->cur_skb); ++ ++ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { ++ unsigned int pg_idx, pg_off, pg_sz; ++ ++ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; ++ ++ pg_idx = 0; ++ pg_off = skb_frag_off(frag); ++ pg_sz = skb_frag_size(frag); ++ ++ if (skb_frag_must_loop(skb_frag_page(frag))) { ++ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; ++ pg_off = offset_in_page(pg_off + st->frag_off); ++ pg_sz = min_t(unsigned int, pg_sz - st->frag_off, ++ PAGE_SIZE - pg_off); ++ } ++ ++ block_limit = pg_sz + st->stepped_offset; ++ if (abs_offset < block_limit) { ++ if (!st->frag_data) ++ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); ++ ++ *data = (u8 *)st->frag_data + pg_off + ++ (abs_offset - st->stepped_offset); ++ ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ st->stepped_offset += pg_sz; ++ st->frag_off += pg_sz; ++ if (st->frag_off == skb_frag_size(frag)) { ++ st->frag_off = 0; ++ st->frag_idx++; ++ } ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { ++ st->cur_skb = skb_shinfo(st->root_skb)->frag_list; ++ st->frag_idx = 0; ++ goto next_skb; ++ } else if (st->cur_skb->next) { ++ st->cur_skb = st->cur_skb->next; ++ st->frag_idx = 0; ++ goto next_skb; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_seq_read); ++ ++/** ++ * skb_abort_seq_read - Abort a sequential read of skb data ++ * @st: state variable ++ * ++ * Must be called if skb_seq_read() was not called until it ++ * returned 0. ++ */ ++void skb_abort_seq_read(struct skb_seq_state *st) ++{ ++ if (st->frag_data) ++ kunmap_atomic(st->frag_data); ++} ++EXPORT_SYMBOL(skb_abort_seq_read); ++ ++#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) ++ ++static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, ++ struct ts_config *conf, ++ struct ts_state *state) ++{ ++ return skb_seq_read(offset, text, TS_SKB_CB(state)); ++} ++ ++static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) ++{ ++ skb_abort_seq_read(TS_SKB_CB(state)); ++} ++ ++/** ++ * skb_find_text - Find a text pattern in skb data ++ * @skb: the buffer to look in ++ * @from: search offset ++ * @to: search limit ++ * @config: textsearch configuration ++ * ++ * Finds a pattern in the skb data according to the specified ++ * textsearch configuration. Use textsearch_next() to retrieve ++ * subsequent occurrences of the pattern. Returns the offset ++ * to the first occurrence or UINT_MAX if no match was found. 
++ */ ++unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct ts_config *config) ++{ ++ struct ts_state state; ++ unsigned int ret; ++ ++ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); ++ ++ config->get_next_block = skb_ts_get_next_block; ++ config->finish = skb_ts_finish; ++ ++ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ++ ++ ret = textsearch_find(config, &state); ++ return (ret <= to - from ? ret : UINT_MAX); ++} ++EXPORT_SYMBOL(skb_find_text); ++ ++int skb_append_pagefrags(struct sk_buff *skb, struct page *page, ++ int offset, size_t size) ++{ ++ int i = skb_shinfo(skb)->nr_frags; ++ ++ if (skb_can_coalesce(skb, i, page, offset)) { ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); ++ } else if (i < MAX_SKB_FRAGS) { ++ skb_zcopy_downgrade_managed(skb); ++ get_page(page); ++ skb_fill_page_desc_noacc(skb, i, page, offset, size); ++ } else { ++ return -EMSGSIZE; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_append_pagefrags); ++ ++/** ++ * skb_pull_rcsum - pull skb and update receive checksum ++ * @skb: buffer to update ++ * @len: length of data pulled ++ * ++ * This function performs an skb_pull on the packet and updates ++ * the CHECKSUM_COMPLETE checksum. It should be used on ++ * receive path processing instead of skb_pull unless you know ++ * that the checksum difference is zero (e.g., a valid IP header) ++ * or you are setting ip_summed to CHECKSUM_NONE. ++ */ ++void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) ++{ ++ unsigned char *data = skb->data; ++ ++ BUG_ON(len > skb->len); ++ __skb_pull(skb, len); ++ skb_postpull_rcsum(skb, data, len); ++ return skb->data; ++} ++EXPORT_SYMBOL_GPL(skb_pull_rcsum); ++ ++static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) ++{ ++ skb_frag_t head_frag; ++ struct page *page; ++ ++ page = virt_to_head_page(frag_skb->head); ++ __skb_frag_set_page(&head_frag, page); ++ skb_frag_off_set(&head_frag, frag_skb->data - ++ (unsigned char *)page_address(page)); ++ skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); ++ return head_frag; ++} ++ ++struct sk_buff *skb_segment_list(struct sk_buff *skb, ++ netdev_features_t features, ++ unsigned int offset) ++{ ++ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; ++ unsigned int tnl_hlen = skb_tnl_header_len(skb); ++ unsigned int delta_truesize = 0; ++ unsigned int delta_len = 0; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *nskb, *tmp; ++ int len_diff, err; ++ ++ skb_push(skb, -skb_network_offset(skb) + offset); ++ ++ skb_shinfo(skb)->frag_list = NULL; ++ ++ do { ++ nskb = list_skb; ++ list_skb = list_skb->next; ++ ++ err = 0; ++ delta_truesize += nskb->truesize; ++ if (skb_shared(nskb)) { ++ tmp = skb_clone(nskb, GFP_ATOMIC); ++ if (tmp) { ++ consume_skb(nskb); ++ nskb = tmp; ++ err = skb_unclone(nskb, GFP_ATOMIC); ++ } else { ++ err = -ENOMEM; ++ } ++ } ++ ++ if (!tail) ++ skb->next = nskb; ++ else ++ tail->next = nskb; ++ ++ if (unlikely(err)) { ++ nskb->next = list_skb; ++ goto err_linearize; ++ } ++ ++ tail = nskb; ++ ++ delta_len += nskb->len; ++ ++ skb_push(nskb, -skb_network_offset(nskb) + offset); ++ ++ skb_release_head_state(nskb); ++ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); ++ __copy_skb_header(nskb, skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); ++ nskb->transport_header += len_diff; ++ skb_copy_from_linear_data_offset(skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ offset + tnl_hlen); ++ ++ if 
(skb_needs_linearize(nskb, features) && ++ __skb_linearize(nskb)) ++ goto err_linearize; ++ ++ } while (list_skb); ++ ++ skb->truesize = skb->truesize - delta_truesize; ++ skb->data_len = skb->data_len - delta_len; ++ skb->len = skb->len - delta_len; ++ ++ skb_gso_reset(skb); ++ ++ skb->prev = tail; ++ ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto err_linearize; ++ ++ skb_get(skb); ++ ++ return skb; ++ ++err_linearize: ++ kfree_skb_list(skb->next); ++ skb->next = NULL; ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(skb_segment_list); ++ ++/** ++ * skb_segment - Perform protocol segmentation on skb. ++ * @head_skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * ++ * This function performs segmentation on the given skb. It returns ++ * a pointer to the first in a list of new skbs for the segments. ++ * In case of error it returns ERR_PTR(err). ++ */ ++struct sk_buff *skb_segment(struct sk_buff *head_skb, ++ netdev_features_t features) ++{ ++ struct sk_buff *segs = NULL; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; ++ skb_frag_t *frag = skb_shinfo(head_skb)->frags; ++ unsigned int mss = skb_shinfo(head_skb)->gso_size; ++ unsigned int doffset = head_skb->data - skb_mac_header(head_skb); ++ struct sk_buff *frag_skb = head_skb; ++ unsigned int offset = doffset; ++ unsigned int tnl_hlen = skb_tnl_header_len(head_skb); ++ unsigned int partial_segs = 0; ++ unsigned int headroom; ++ unsigned int len = head_skb->len; ++ __be16 proto; ++ bool csum, sg; ++ int nfrags = skb_shinfo(head_skb)->nr_frags; ++ int err = -ENOMEM; ++ int i = 0; ++ int pos; ++ ++ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && ++ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { ++ struct sk_buff *check_skb; ++ ++ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { ++ if (skb_headlen(check_skb) && !check_skb->head_frag) { ++ /* gso_size is untrusted, and we have a frag_list with ++ * a linear non head_frag item. ++ * ++ * If head_skb's headlen does not fit requested gso_size, ++ * it means that the frag_list members do NOT terminate ++ * on exact gso_size boundaries. Hence we cannot perform ++ * skb_frag_t page sharing. Therefore we must fallback to ++ * copying the frag_list skbs; we do so by disabling SG. ++ */ ++ features &= ~NETIF_F_SG; ++ break; ++ } ++ } ++ } ++ ++ __skb_push(head_skb, doffset); ++ proto = skb_network_protocol(head_skb, NULL); ++ if (unlikely(!proto)) ++ return ERR_PTR(-EINVAL); ++ ++ sg = !!(features & NETIF_F_SG); ++ csum = !!can_checksum_protocol(features, proto); ++ ++ if (sg && csum && (mss != GSO_BY_FRAGS)) { ++ if (!(features & NETIF_F_GSO_PARTIAL)) { ++ struct sk_buff *iter; ++ unsigned int frag_len; ++ ++ if (!list_skb || ++ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) ++ goto normal; ++ ++ /* If we get here then all the required ++ * GSO features except frag_list are supported. ++ * Try to split the SKB to multiple GSO SKBs ++ * with no frag_list. ++ * Currently we can do that only when the buffers don't ++ * have a linear part and all the buffers except ++ * the last are of the same length. 
++ */ ++ frag_len = list_skb->len; ++ skb_walk_frags(head_skb, iter) { ++ if (frag_len != iter->len && iter->next) ++ goto normal; ++ if (skb_headlen(iter) && !iter->head_frag) ++ goto normal; ++ ++ len -= iter->len; ++ } ++ ++ if (len != frag_len) ++ goto normal; ++ } ++ ++ /* GSO partial only requires that we trim off any excess that ++ * doesn't fit into an MSS sized block, so take care of that ++ * now. ++ */ ++ partial_segs = len / mss; ++ if (partial_segs > 1) ++ mss *= partial_segs; ++ else ++ partial_segs = 0; ++ } ++ ++normal: ++ headroom = skb_headroom(head_skb); ++ pos = skb_headlen(head_skb); ++ ++ do { ++ struct sk_buff *nskb; ++ skb_frag_t *nskb_frag; ++ int hsize; ++ int size; ++ ++ if (unlikely(mss == GSO_BY_FRAGS)) { ++ len = list_skb->len; ++ } else { ++ len = head_skb->len - offset; ++ if (len > mss) ++ len = mss; ++ } ++ ++ hsize = skb_headlen(head_skb) - offset; ++ ++ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && ++ (skb_headlen(list_skb) == len || sg)) { ++ BUG_ON(skb_headlen(list_skb) > len); ++ ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ pos += skb_headlen(list_skb); ++ ++ while (pos < offset + len) { ++ BUG_ON(i >= nfrags); ++ ++ size = skb_frag_size(frag); ++ if (pos + size > offset + len) ++ break; ++ ++ i++; ++ pos += size; ++ frag++; ++ } ++ ++ nskb = skb_clone(list_skb, GFP_ATOMIC); ++ list_skb = list_skb->next; ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ if (unlikely(pskb_trim(nskb, len))) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ hsize = skb_end_offset(nskb); ++ if (skb_cow_head(nskb, doffset + headroom)) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ nskb->truesize += skb_end_offset(nskb) - hsize; ++ skb_release_head_state(nskb); ++ __skb_push(nskb, doffset); ++ } else { ++ if (hsize < 0) ++ hsize = 0; ++ if (hsize > len || !sg) ++ hsize = len; ++ ++ nskb = __alloc_skb(hsize + doffset + headroom, ++ GFP_ATOMIC, skb_alloc_rx_flag(head_skb), ++ NUMA_NO_NODE); ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, doffset); ++ } ++ ++ if (segs) ++ tail->next = nskb; ++ else ++ segs = nskb; ++ tail = nskb; ++ ++ __copy_skb_header(nskb, head_skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); ++ skb_reset_mac_len(nskb); ++ ++ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ doffset + tnl_hlen); ++ ++ if (nskb->len == len + doffset) ++ goto perform_csum_check; ++ ++ if (!sg) { ++ if (!csum) { ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_copy_and_csum_bits(head_skb, offset, ++ skb_put(nskb, ++ len), ++ len); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } else { ++ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) ++ goto err; ++ } ++ continue; ++ } ++ ++ nskb_frag = skb_shinfo(nskb)->frags; ++ ++ skb_copy_from_linear_data_offset(head_skb, offset, ++ skb_put(nskb, hsize), hsize); ++ ++ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & ++ SKBFL_SHARED_FRAG; ++ ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) ++ goto err; ++ ++ while (pos < offset + len) { ++ if (i >= nfrags) { ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ if (!skb_headlen(list_skb)) { ++ BUG_ON(!nfrags); ++ } else { ++ BUG_ON(!list_skb->head_frag); ++ ++ /* to make room for head_frag. 
*/ ++ i--; ++ frag--; ++ } ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, ++ GFP_ATOMIC)) ++ goto err; ++ ++ list_skb = list_skb->next; ++ } ++ ++ if (unlikely(skb_shinfo(nskb)->nr_frags >= ++ MAX_SKB_FRAGS)) { ++ net_warn_ratelimited( ++ "skb_segment: too many frags: %u %u\n", ++ pos, mss); ++ err = -EINVAL; ++ goto err; ++ } ++ ++ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; ++ __skb_frag_ref(nskb_frag); ++ size = skb_frag_size(nskb_frag); ++ ++ if (pos < offset) { ++ skb_frag_off_add(nskb_frag, offset - pos); ++ skb_frag_size_sub(nskb_frag, offset - pos); ++ } ++ ++ skb_shinfo(nskb)->nr_frags++; ++ ++ if (pos + size <= offset + len) { ++ i++; ++ frag++; ++ pos += size; ++ } else { ++ skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); ++ goto skip_fraglist; ++ } ++ ++ nskb_frag++; ++ } ++ ++skip_fraglist: ++ nskb->data_len = len - hsize; ++ nskb->len += nskb->data_len; ++ nskb->truesize += nskb->data_len; ++ ++perform_csum_check: ++ if (!csum) { ++ if (skb_has_shared_frag(nskb) && ++ __skb_linearize(nskb)) ++ goto err; ++ ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_checksum(nskb, doffset, ++ nskb->len - doffset, 0); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } ++ } while ((offset += len) < head_skb->len); ++ ++ /* Some callers want to get the end of the list. ++ * Put it in segs->prev to avoid walking the list. ++ * (see validate_xmit_skb_list() for example) ++ */ ++ segs->prev = tail; ++ ++ if (partial_segs) { ++ struct sk_buff *iter; ++ int type = skb_shinfo(head_skb)->gso_type; ++ unsigned short gso_size = skb_shinfo(head_skb)->gso_size; ++ ++ /* Update type to add partial and then remove dodgy if set */ ++ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; ++ type &= ~SKB_GSO_DODGY; ++ ++ /* Update GSO info and prepare to start updating headers on ++ * our way back down the stack of protocols. ++ */ ++ for (iter = segs; iter; iter = iter->next) { ++ skb_shinfo(iter)->gso_size = gso_size; ++ skb_shinfo(iter)->gso_segs = partial_segs; ++ skb_shinfo(iter)->gso_type = type; ++ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; ++ } ++ ++ if (tail->len - doffset <= gso_size) ++ skb_shinfo(tail)->gso_size = 0; ++ else if (tail != segs) ++ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); ++ } ++ ++ /* Following permits correct backpressure, for protocols ++ * using skb_set_owner_w(). ++ * Idea is to tranfert ownership from head_skb to last segment. 
++ */ ++ if (head_skb->destructor == sock_wfree) { ++ swap(tail->truesize, head_skb->truesize); ++ swap(tail->destructor, head_skb->destructor); ++ swap(tail->sk, head_skb->sk); ++ } ++ return segs; ++ ++err: ++ kfree_skb_list(segs); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(skb_segment); ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++#define SKB_EXT_ALIGN_VALUE 8 ++#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) ++ ++static const u8 skb_ext_type_len[] = { ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), ++#endif ++#ifdef CONFIG_XFRM ++ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), ++#endif ++}; ++ ++static __always_inline unsigned int skb_ext_total_length(void) ++{ ++ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ skb_ext_type_len[SKB_EXT_BRIDGE_NF] + ++#endif ++#ifdef CONFIG_XFRM ++ skb_ext_type_len[SKB_EXT_SEC_PATH] + ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ skb_ext_type_len[TC_SKB_EXT] + ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ skb_ext_type_len[SKB_EXT_MPTCP] + ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ skb_ext_type_len[SKB_EXT_MCTP] + ++#endif ++ 0; ++} ++ ++static void skb_extensions_init(void) ++{ ++ BUILD_BUG_ON(SKB_EXT_NUM >= 8); ++ BUILD_BUG_ON(skb_ext_total_length() > 255); ++ ++ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", ++ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++} ++#else ++static void skb_extensions_init(void) {} ++#endif ++ ++void __init skb_init(void) ++{ ++ skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", ++ sizeof(struct sk_buff), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ offsetof(struct sk_buff, cb), ++ sizeof_field(struct sk_buff, cb), ++ NULL); ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ sizeof(struct sk_buff_fclones), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++ skb_extensions_init(); ++} ++ ++static int ++__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, ++ unsigned int recursion_level) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int elt = 0; ++ ++ if (unlikely(recursion_level >= 24)) ++ return -EMSGSIZE; ++ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ sg_set_buf(sg, skb->data + offset, copy); ++ elt++; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ sg_set_page(&sg[elt], skb_frag_page(frag), copy, ++ skb_frag_off(frag) + offset - start); ++ elt++; ++ if (!(len -= copy)) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end, ret; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) 
{ ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, ++ copy, recursion_level + 1); ++ if (unlikely(ret < 0)) ++ return ret; ++ elt += ret; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return elt; ++} ++ ++/** ++ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer ++ * @skb: Socket buffer containing the buffers to be mapped ++ * @sg: The scatter-gather list to map into ++ * @offset: The offset into the buffer's contents to start mapping ++ * @len: Length of buffer space to be mapped ++ * ++ * Fill the specified scatter-gather list with mappings/pointers into a ++ * region of the buffer space attached to a socket buffer. Returns either ++ * the number of scatterlist items used, or -EMSGSIZE if the contents ++ * could not fit. ++ */ ++int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); ++ ++ if (nsg <= 0) ++ return nsg; ++ ++ sg_mark_end(&sg[nsg - 1]); ++ ++ return nsg; ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec); ++ ++/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given ++ * sglist without mark the sg which contain last skb data as the end. ++ * So the caller can mannipulate sg list as will when padding new data after ++ * the first call without calling sg_unmark_end to expend sg list. ++ * ++ * Scenario to use skb_to_sgvec_nomark: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec_nomark(payload1) ++ * 3. skb_to_sgvec_nomark(payload2) ++ * ++ * This is equivalent to: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec(payload1) ++ * 3. sg_unmark_end ++ * 4. skb_to_sgvec(payload2) ++ * ++ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark ++ * is more preferable. ++ */ ++int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, ++ int offset, int len) ++{ ++ return __skb_to_sgvec(skb, sg, offset, len, 0); ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); ++ ++ ++ ++/** ++ * skb_cow_data - Check that a socket buffer's data buffers are writable ++ * @skb: The socket buffer to check. ++ * @tailbits: Amount of trailing space to be added ++ * @trailer: Returned pointer to the skb where the @tailbits space begins ++ * ++ * Make sure that the data buffers attached to a socket buffer are ++ * writable. If they are not, private copies are made of the data buffers ++ * and the socket buffer is set to use these instead. ++ * ++ * If @tailbits is given, make sure that there is space to write @tailbits ++ * bytes of data beyond current end of socket buffer. @trailer will be ++ * set to point to the skb in which this space begins. ++ * ++ * The number of scatterlist elements required to completely map the ++ * COW'd and extended socket buffer will be returned. ++ */ ++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) ++{ ++ int copyflag; ++ int elt; ++ struct sk_buff *skb1, **skb_p; ++ ++ /* If skb is cloned or its head is paged, reallocate ++ * head pulling out all the pages (pages are considered not writable ++ * at the moment even if they are anonymous). ++ */ ++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && ++ !__pskb_pull_tail(skb, __skb_pagelen(skb))) ++ return -ENOMEM; ++ ++ /* Easy case. Most of packets will go this way. */ ++ if (!skb_has_frag_list(skb)) { ++ /* A little of trouble, not enough of space for trailer. 
++ * This should not happen, when stack is tuned to generate ++ * good frames. OK, on miss we reallocate and reserve even more ++ * space, 128 bytes is fair. */ ++ ++ if (skb_tailroom(skb) < tailbits && ++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ /* Voila! */ ++ *trailer = skb; ++ return 1; ++ } ++ ++ /* Misery. We are in troubles, going to mincer fragments... */ ++ ++ elt = 1; ++ skb_p = &skb_shinfo(skb)->frag_list; ++ copyflag = 0; ++ ++ while ((skb1 = *skb_p) != NULL) { ++ int ntail = 0; ++ ++ /* The fragment is partially pulled by someone, ++ * this can happen on input. Copy it and everything ++ * after it. */ ++ ++ if (skb_shared(skb1)) ++ copyflag = 1; ++ ++ /* If the skb is the last, worry about trailer. */ ++ ++ if (skb1->next == NULL && tailbits) { ++ if (skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1) || ++ skb_tailroom(skb1) < tailbits) ++ ntail = tailbits + 128; ++ } ++ ++ if (copyflag || ++ skb_cloned(skb1) || ++ ntail || ++ skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1)) { ++ struct sk_buff *skb2; ++ ++ /* Fuck, we are miserable poor guys... */ ++ if (ntail == 0) ++ skb2 = skb_copy(skb1, GFP_ATOMIC); ++ else ++ skb2 = skb_copy_expand(skb1, ++ skb_headroom(skb1), ++ ntail, ++ GFP_ATOMIC); ++ if (unlikely(skb2 == NULL)) ++ return -ENOMEM; ++ ++ if (skb1->sk) ++ skb_set_owner_w(skb2, skb1->sk); ++ ++ /* Looking around. Are we still alive? ++ * OK, link new skb, drop old one */ ++ ++ skb2->next = skb1->next; ++ *skb_p = skb2; ++ kfree_skb(skb1); ++ skb1 = skb2; ++ } ++ elt++; ++ *trailer = skb1; ++ skb_p = &skb1->next; ++ } ++ ++ return elt; ++} ++EXPORT_SYMBOL_GPL(skb_cow_data); ++ ++static void sock_rmem_free(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ ++ atomic_sub(skb->truesize, &sk->sk_rmem_alloc); ++} ++ ++static void skb_set_err_queue(struct sk_buff *skb) ++{ ++ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. ++ * So, it is safe to (mis)use it to mark skbs on the error queue. 
++ */ ++ skb->pkt_type = PACKET_OUTGOING; ++ BUILD_BUG_ON(PACKET_OUTGOING == 0); ++} ++ ++/* ++ * Note: We dont mem charge error packets (no sk_forward_alloc changes) ++ */ ++int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= ++ (unsigned int)READ_ONCE(sk->sk_rcvbuf)) ++ return -ENOMEM; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sock_rmem_free; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ skb_set_err_queue(skb); ++ ++ /* before exiting rcu section, make sure dst is refcounted */ ++ skb_dst_force(skb); ++ ++ skb_queue_tail(&sk->sk_error_queue, skb); ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk_error_report(sk); ++ return 0; ++} ++EXPORT_SYMBOL(sock_queue_err_skb); ++ ++static bool is_icmp_err_skb(const struct sk_buff *skb) ++{ ++ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || ++ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); ++} ++ ++struct sk_buff *sock_dequeue_err_skb(struct sock *sk) ++{ ++ struct sk_buff_head *q = &sk->sk_error_queue; ++ struct sk_buff *skb, *skb_next = NULL; ++ bool icmp_next = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ skb = __skb_dequeue(q); ++ if (skb && (skb_next = skb_peek(q))) { ++ icmp_next = is_icmp_err_skb(skb_next); ++ if (icmp_next) ++ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ if (is_icmp_err_skb(skb) && !icmp_next) ++ sk->sk_err = 0; ++ ++ if (skb_next) ++ sk_error_report(sk); ++ ++ return skb; ++} ++EXPORT_SYMBOL(sock_dequeue_err_skb); ++ ++/** ++ * skb_clone_sk - create clone of skb, and take reference to socket ++ * @skb: the skb to clone ++ * ++ * This function creates a clone of a buffer that holds a reference on ++ * sk_refcnt. Buffers created via this function are meant to be ++ * returned using sock_queue_err_skb, or free via kfree_skb. ++ * ++ * When passing buffers allocated with this function to sock_queue_err_skb ++ * it is necessary to wrap the call with sock_hold/sock_put in order to ++ * prevent the socket from being released prior to being enqueued on ++ * the sk_error_queue. ++ */ ++struct sk_buff *skb_clone_sk(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ struct sk_buff *clone; ++ ++ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) ++ return NULL; ++ ++ clone = skb_clone(skb, GFP_ATOMIC); ++ if (!clone) { ++ sock_put(sk); ++ return NULL; ++ } ++ ++ clone->sk = sk; ++ clone->destructor = sock_efree; ++ ++ return clone; ++} ++EXPORT_SYMBOL(skb_clone_sk); ++ ++static void __skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct sock *sk, ++ int tstype, ++ bool opt_stats) ++{ ++ struct sock_exterr_skb *serr; ++ int err; ++ ++ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; ++ serr->ee.ee_info = tstype; ++ serr->opt_stats = opt_stats; ++ serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; ++ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { ++ serr->ee.ee_data = skb_shinfo(skb)->tskey; ++ if (sk_is_tcp(sk)) ++ serr->ee.ee_data -= atomic_read(&sk->sk_tskey); ++ } ++ ++ err = sock_queue_err_skb(sk, skb); ++ ++ if (err) ++ kfree_skb(skb); ++} ++ ++static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) ++{ ++ bool ret; ++ ++ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) ++ return true; ++ ++ read_lock_bh(&sk->sk_callback_lock); ++ ret = sk->sk_socket && sk->sk_socket->file && ++ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); ++ read_unlock_bh(&sk->sk_callback_lock); ++ return ret; ++} ++ ++void skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ struct sock *sk = skb->sk; ++ ++ if (!skb_may_tx_timestamp(sk, false)) ++ goto err; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ *skb_hwtstamps(skb) = *hwtstamps; ++ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); ++ sock_put(sk); ++ return; ++ } ++ ++err: ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); ++ ++void __skb_tstamp_tx(struct sk_buff *orig_skb, ++ const struct sk_buff *ack_skb, ++ struct skb_shared_hwtstamps *hwtstamps, ++ struct sock *sk, int tstype) ++{ ++ struct sk_buff *skb; ++ bool tsonly, opt_stats = false; ++ ++ if (!sk) ++ return; ++ ++ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && ++ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) ++ return; ++ ++ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; ++ if (!skb_may_tx_timestamp(sk, tsonly)) ++ return; ++ ++ if (tsonly) { ++#ifdef CONFIG_INET ++ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && ++ sk_is_tcp(sk)) { ++ skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ++ ack_skb); ++ opt_stats = true; ++ } else ++#endif ++ skb = alloc_skb(0, GFP_ATOMIC); ++ } else { ++ skb = skb_clone(orig_skb, GFP_ATOMIC); ++ } ++ if (!skb) ++ return; ++ ++ if (tsonly) { ++ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & ++ SKBTX_ANY_TSTAMP; ++ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; ++ } ++ ++ if (hwtstamps) ++ *skb_hwtstamps(skb) = *hwtstamps; ++ else ++ __net_timestamp(skb); ++ ++ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); ++} ++EXPORT_SYMBOL_GPL(__skb_tstamp_tx); ++ ++void skb_tstamp_tx(struct sk_buff *orig_skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, ++ SCM_TSTAMP_SND); ++} ++EXPORT_SYMBOL_GPL(skb_tstamp_tx); ++ ++void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) ++{ ++ struct sock *sk = skb->sk; ++ struct sock_exterr_skb *serr; ++ int err = 1; ++ ++ skb->wifi_acked_valid = 1; ++ skb->wifi_acked = acked; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. 
++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ err = sock_queue_err_skb(sk, skb); ++ sock_put(sk); ++ } ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); ++ ++/** ++ * skb_partial_csum_set - set up and verify partial csum values for packet ++ * @skb: the skb to set ++ * @start: the number of bytes after skb->data to start checksumming. ++ * @off: the offset from start to place the checksum. ++ * ++ * For untrusted partially-checksummed packets, we need to make sure the values ++ * for skb->csum_start and skb->csum_offset are valid so we don't oops. ++ * ++ * This function checks and sets those values and skb->ip_summed: if this ++ * returns false you should drop the packet. ++ */ ++bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) ++{ ++ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); ++ u32 csum_start = skb_headroom(skb) + (u32)start; ++ ++ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { ++ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", ++ start, off, skb_headroom(skb), skb_headlen(skb)); ++ return false; ++ } ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = csum_start; ++ skb->csum_offset = off; ++ skb_set_transport_header(skb, start); ++ return true; ++} ++EXPORT_SYMBOL_GPL(skb_partial_csum_set); ++ ++static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, ++ unsigned int max) ++{ ++ if (skb_headlen(skb) >= len) ++ return 0; ++ ++ /* If we need to pullup then pullup to the max, so we ++ * won't need to do it again. ++ */ ++ if (max > skb->len) ++ max = skb->len; ++ ++ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) ++ return -ENOMEM; ++ ++ if (skb_headlen(skb) < len) ++ return -EPROTO; ++ ++ return 0; ++} ++ ++#define MAX_TCP_HDR_LEN (15 * 4) ++ ++static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, ++ typeof(IPPROTO_IP) proto, ++ unsigned int off) ++{ ++ int err; ++ ++ switch (proto) { ++ case IPPROTO_TCP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), ++ off + MAX_TCP_HDR_LEN); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct tcphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; ++ ++ case IPPROTO_UDP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), ++ off + sizeof(struct udphdr)); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct udphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &udp_hdr(skb)->check; ++ } ++ ++ return ERR_PTR(-EPROTO); ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * maximally sized IP and TCP or UDP headers. 
++ */ ++#define MAX_IP_HDR_LEN 128 ++ ++static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) ++{ ++ unsigned int off; ++ bool fragment; ++ __sum16 *csum; ++ int err; ++ ++ fragment = false; ++ ++ err = skb_maybe_pull_tail(skb, ++ sizeof(struct iphdr), ++ MAX_IP_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ if (ip_is_fragment(ip_hdr(skb))) ++ fragment = true; ++ ++ off = ip_hdrlen(skb); ++ ++ err = -EPROTO; ++ ++ if (fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ skb->len - off, ++ ip_hdr(skb)->protocol, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * an IPv6 header, all options, and a maximal TCP or UDP header. ++ */ ++#define MAX_IPV6_HDR_LEN 256 ++ ++#define OPT_HDR(type, skb, off) \ ++ (type *)(skb_network_header(skb) + (off)) ++ ++static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ u8 nexthdr; ++ unsigned int off; ++ unsigned int len; ++ bool fragment; ++ bool done; ++ __sum16 *csum; ++ ++ fragment = false; ++ done = false; ++ ++ off = sizeof(struct ipv6hdr); ++ ++ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ nexthdr = ipv6_hdr(skb)->nexthdr; ++ ++ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); ++ while (off <= len && !done) { ++ switch (nexthdr) { ++ case IPPROTO_DSTOPTS: ++ case IPPROTO_HOPOPTS: ++ case IPPROTO_ROUTING: { ++ struct ipv6_opt_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ipv6_opt_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_optlen(hp); ++ break; ++ } ++ case IPPROTO_AH: { ++ struct ip_auth_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ip_auth_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ip_auth_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_authlen(hp); ++ break; ++ } ++ case IPPROTO_FRAGMENT: { ++ struct frag_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct frag_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct frag_hdr, skb, off); ++ ++ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) ++ fragment = true; ++ ++ nexthdr = hp->nexthdr; ++ off += sizeof(struct frag_hdr); ++ break; ++ } ++ default: ++ done = true; ++ break; ++ } ++ } ++ ++ err = -EPROTO; ++ ++ if (!done || fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, nexthdr, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, ++ &ipv6_hdr(skb)->daddr, ++ skb->len - off, nexthdr, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/** ++ * skb_checksum_setup - set up partial checksum offset ++ * @skb: the skb to set up ++ * @recalculate: if true the pseudo-header checksum will be recalculated ++ */ ++int skb_checksum_setup(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_IP): ++ err = skb_checksum_setup_ipv4(skb, recalculate); ++ break; ++ ++ case htons(ETH_P_IPV6): ++ err = skb_checksum_setup_ipv6(skb, recalculate); ++ break; ++ ++ default: ++ err = -EPROTO; ++ break; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(skb_checksum_setup); ++ 
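(For context, and not part of the patch itself: skb_checksum_setup() added above is the entry point a paravirtualised receive path typically calls, e.g. xen-netback, to validate untrusted CHECKSUM_PARTIAL metadata before a packet enters the stack. Below is a minimal sketch under that assumption; the function name and the csum_from_guest flag are invented for illustration only.)

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Hypothetical rx completion helper: the frontend claimed the packet is
 * partially checksummed, so verify/repair csum_start/csum_offset and
 * recompute the pseudo-header checksum before handing the skb up.
 */
static void example_rx_finish(struct sk_buff *skb, bool csum_from_guest)
{
	if (csum_from_guest && skb_checksum_setup(skb, true)) {
		/* Bad checksum offsets from an untrusted source: drop. */
		kfree_skb(skb);
		return;
	}

	netif_receive_skb(skb);
}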
++/** ++ * skb_checksum_maybe_trim - maybe trims the given skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * ++ * Checks whether the given skb has data beyond the given transport length. ++ * If so, returns a cloned skb trimmed to this transport length. ++ * Otherwise returns the provided skb. Returns NULL in error cases ++ * (e.g. transport_len exceeds skb length or out-of-memory). ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, ++ unsigned int transport_len) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int len = skb_transport_offset(skb) + transport_len; ++ int ret; ++ ++ if (skb->len < len) ++ return NULL; ++ else if (skb->len == len) ++ return skb; ++ ++ skb_chk = skb_clone(skb, GFP_ATOMIC); ++ if (!skb_chk) ++ return NULL; ++ ++ ret = pskb_trim_rcsum(skb_chk, len); ++ if (ret) { ++ kfree_skb(skb_chk); ++ return NULL; ++ } ++ ++ return skb_chk; ++} ++ ++/** ++ * skb_checksum_trimmed - validate checksum of an skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * @skb_chkf: checksum function to use ++ * ++ * Applies the given checksum function skb_chkf to the provided skb. ++ * Returns a checked and maybe trimmed skb. Returns NULL on error. ++ * ++ * If the skb has data beyond the given transport length, then a ++ * trimmed & cloned skb is checked and returned. ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, ++ unsigned int transport_len, ++ __sum16(*skb_chkf)(struct sk_buff *skb)) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int offset = skb_transport_offset(skb); ++ __sum16 ret; ++ ++ skb_chk = skb_checksum_maybe_trim(skb, transport_len); ++ if (!skb_chk) ++ goto err; ++ ++ if (!pskb_may_pull(skb_chk, offset)) ++ goto err; ++ ++ skb_pull_rcsum(skb_chk, offset); ++ ret = skb_chkf(skb_chk); ++ skb_push_rcsum(skb_chk, offset); ++ ++ if (ret) ++ goto err; ++ ++ return skb_chk; ++ ++err: ++ if (skb_chk && skb_chk != skb) ++ kfree_skb(skb_chk); ++ ++ return NULL; ++ ++} ++EXPORT_SYMBOL(skb_checksum_trimmed); ++ ++void __skb_warn_lro_forwarding(const struct sk_buff *skb) ++{ ++ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", ++ skb->dev->name); ++} ++EXPORT_SYMBOL(__skb_warn_lro_forwarding); ++ ++void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) ++{ ++ if (head_stolen) { ++ skb_release_head_state(skb); ++ kmem_cache_free(skbuff_head_cache, skb); ++ } else { ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(kfree_skb_partial); ++ ++/** ++ * skb_try_coalesce - try to merge skb to prior one ++ * @to: prior buffer ++ * @from: buffer to add ++ * @fragstolen: pointer to boolean ++ * @delta_truesize: how much more was allocated than was requested ++ */ ++bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, ++ bool *fragstolen, int *delta_truesize) ++{ ++ struct skb_shared_info *to_shinfo, *from_shinfo; ++ int i, delta, len = from->len; ++ ++ *fragstolen = false; ++ ++ if (skb_cloned(to)) ++ return false; ++ ++ /* In general, avoid mixing slab allocated and page_pool allocated ++ * pages within the same SKB. However when @to is not pp_recycle and ++ * @from is cloned, we can transition frag pages from page_pool to ++ * reference counted. 
++ * ++ * On the other hand, don't allow coalescing two pp_recycle SKBs if ++ * @from is cloned, in case the SKB is using page_pool fragment ++ * references (PP_FLAG_PAGE_FRAG). Since we only take full page ++ * references for cloned SKBs at the moment that would result in ++ * inconsistent reference counts. ++ */ ++ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) ++ return false; ++ ++ if (len <= skb_tailroom(to)) { ++ if (len) ++ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); ++ *delta_truesize = 0; ++ return true; ++ } ++ ++ to_shinfo = skb_shinfo(to); ++ from_shinfo = skb_shinfo(from); ++ if (to_shinfo->frag_list || from_shinfo->frag_list) ++ return false; ++ if (skb_zcopy(to) || skb_zcopy(from)) ++ return false; ++ ++ if (skb_headlen(from) != 0) { ++ struct page *page; ++ unsigned int offset; ++ ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags >= MAX_SKB_FRAGS) ++ return false; ++ ++ if (skb_head_is_locked(from)) ++ return false; ++ ++ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ ++ skb_fill_page_desc(to, to_shinfo->nr_frags, ++ page, offset, skb_headlen(from)); ++ *fragstolen = true; ++ } else { ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags > MAX_SKB_FRAGS) ++ return false; ++ ++ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); ++ } ++ ++ WARN_ON_ONCE(delta < len); ++ ++ memcpy(to_shinfo->frags + to_shinfo->nr_frags, ++ from_shinfo->frags, ++ from_shinfo->nr_frags * sizeof(skb_frag_t)); ++ to_shinfo->nr_frags += from_shinfo->nr_frags; ++ ++ if (!skb_cloned(from)) ++ from_shinfo->nr_frags = 0; ++ ++ /* if the skb is not cloned this does nothing ++ * since we set nr_frags to 0. ++ */ ++ for (i = 0; i < from_shinfo->nr_frags; i++) ++ __skb_frag_ref(&from_shinfo->frags[i]); ++ ++ to->truesize += delta; ++ to->len += len; ++ to->data_len += len; ++ ++ *delta_truesize = delta; ++ return true; ++} ++EXPORT_SYMBOL(skb_try_coalesce); ++ ++/** ++ * skb_scrub_packet - scrub an skb ++ * ++ * @skb: buffer to clean ++ * @xnet: packet is crossing netns ++ * ++ * skb_scrub_packet can be used after encapsulating or decapsulting a packet ++ * into/from a tunnel. Some information have to be cleared during these ++ * operations. ++ * skb_scrub_packet can also be used to clean a skb before injecting it in ++ * another namespace (@xnet == true). We have to clear all information in the ++ * skb that could impact namespace isolation. ++ */ ++void skb_scrub_packet(struct sk_buff *skb, bool xnet) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->skb_iif = 0; ++ skb->ignore_df = 0; ++ skb_dst_drop(skb); ++ skb_ext_reset(skb); ++ nf_reset_ct(skb); ++ nf_reset_trace(skb); ++ ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++ skb->offload_l3_fwd_mark = 0; ++#endif ++ ++ if (!xnet) ++ return; ++ ++ ipvs_reset(skb); ++ skb->mark = 0; ++ skb_clear_tstamp(skb); ++} ++EXPORT_SYMBOL_GPL(skb_scrub_packet); ++ ++/** ++ * skb_gso_transport_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_transport_seglen is used to determine the real size of the ++ * individual segments, including Layer4 headers (TCP/UDP). ++ * ++ * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
++ */ ++static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ unsigned int thlen = 0; ++ ++ if (skb->encapsulation) { ++ thlen = skb_inner_transport_header(skb) - ++ skb_transport_header(skb); ++ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) ++ thlen += inner_tcp_hdrlen(skb); ++ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ thlen = tcp_hdrlen(skb); ++ } else if (unlikely(skb_is_gso_sctp(skb))) { ++ thlen = sizeof(struct sctphdr); ++ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { ++ thlen = sizeof(struct udphdr); ++ } ++ /* UFO sets gso_size to the size of the fragmentation ++ * payload, i.e. the size of the L4 (UDP) header is already ++ * accounted for. ++ */ ++ return thlen + shinfo->gso_size; ++} ++ ++/** ++ * skb_gso_network_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_network_seglen is used to determine the real size of the ++ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). ++ * ++ * The MAC/L2 header is not accounted for. ++ */ ++static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - ++ skb_network_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_mac_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_mac_seglen is used to determine the real size of the ++ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 ++ * headers (TCP/UDP). ++ */ ++static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS ++ * ++ * There are a couple of instances where we have a GSO skb, and we ++ * want to determine what size it would be after it is segmented. ++ * ++ * We might want to check: ++ * - L3+L4+payload size (e.g. IP forwarding) ++ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) ++ * ++ * This is a helper to do that correctly considering GSO_BY_FRAGS. ++ * ++ * @skb: GSO skb ++ * ++ * @seg_len: The segmented length (from skb_gso_*_seglen). In the ++ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. ++ * ++ * @max_len: The maximum permissible length. ++ * ++ * Returns true if the segmented length <= max length. ++ */ ++static inline bool skb_gso_size_check(const struct sk_buff *skb, ++ unsigned int seg_len, ++ unsigned int max_len) { ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ const struct sk_buff *iter; ++ ++ if (shinfo->gso_size != GSO_BY_FRAGS) ++ return seg_len <= max_len; ++ ++ /* Undo this so we can re-use header sizes */ ++ seg_len -= GSO_BY_FRAGS; ++ ++ skb_walk_frags(skb, iter) { ++ if (seg_len + skb_headlen(iter) > max_len) ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? ++ * ++ * @skb: GSO skb ++ * @mtu: MTU to validate against ++ * ++ * skb_gso_validate_network_len validates if a given skb will fit a ++ * wanted MTU once split. It considers L3 headers, L4 headers, and the ++ * payload. 
++ */ ++bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) ++{ ++ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); ++ ++/** ++ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? ++ * ++ * @skb: GSO skb ++ * @len: length to validate against ++ * ++ * skb_gso_validate_mac_len validates if a given skb will fit a wanted ++ * length once split, including L2, L3 and L4 headers and the payload. ++ */ ++bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) ++{ ++ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); ++ ++static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) ++{ ++ int mac_len, meta_len; ++ void *meta; ++ ++ if (skb_cow(skb, skb_headroom(skb)) < 0) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ mac_len = skb->data - skb_mac_header(skb); ++ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { ++ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), ++ mac_len - VLAN_HLEN - ETH_TLEN); ++ } ++ ++ meta_len = skb_metadata_len(skb); ++ if (meta_len) { ++ meta = skb_metadata_end(skb) - meta_len; ++ memmove(meta + VLAN_HLEN, meta, meta_len); ++ } ++ ++ skb->mac_header += VLAN_HLEN; ++ return skb; ++} ++ ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb) ++{ ++ struct vlan_hdr *vhdr; ++ u16 vlan_tci; ++ ++ if (unlikely(skb_vlan_tag_present(skb))) { ++ /* vlan_tci is already set-up so leave this for another time */ ++ return skb; ++ } ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ goto err_free; ++ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ ++ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) ++ goto err_free; ++ ++ vhdr = (struct vlan_hdr *)skb->data; ++ vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); ++ ++ skb_pull_rcsum(skb, VLAN_HLEN); ++ vlan_set_encap_proto(skb, vhdr); ++ ++ skb = skb_reorder_vlan_header(skb); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return skb; ++ ++err_free: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_vlan_untag); ++ ++int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) ++{ ++ if (!pskb_may_pull(skb, write_len)) ++ return -ENOMEM; ++ ++ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) ++ return 0; ++ ++ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(skb_ensure_writable); ++ ++/* remove VLAN header from packet and update csum accordingly. 
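skb_vlan_untag() above lifts the TCI out of the in-band 802.1Q header before moving the tag into hw-accel metadata; the snippet below (illustrative only, frame bytes invented) shows the same field extraction on a raw frame laid out as dst(6) src(6) TPID(2) TCI(2).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical tagged frame header: TPID 0x8100, TCI 0x6064 (PCP 3, VID 100) */
    uint8_t frame[18] = {
        0x02, 0x00, 0x00, 0x00, 0x00, 0x01,   /* destination MAC */
        0x02, 0x00, 0x00, 0x00, 0x00, 0x02,   /* source MAC */
        0x81, 0x00,                           /* TPID: 802.1Q */
        0x60, 0x64,                           /* TCI */
        0x08, 0x00                            /* encapsulated ethertype: IPv4 */
    };
    uint16_t tpid = (uint16_t)(frame[12] << 8 | frame[13]);
    uint16_t tci  = (uint16_t)(frame[14] << 8 | frame[15]);

    if (tpid == 0x8100)      /* the check eth_type_vlan() performs on skb->protocol */
        printf("PCP %u, DEI %u, VID %u\n",
               tci >> 13, (tci >> 12) & 1, tci & 0x0fff);
    return 0;
}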
++ * expects a non skb_vlan_tag_present skb with a vlan tag payload ++ */ ++int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) ++{ ++ struct vlan_hdr *vhdr; ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = skb_ensure_writable(skb, VLAN_ETH_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ ++ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); ++ *vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ ++ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); ++ __skb_pull(skb, VLAN_HLEN); ++ ++ vlan_set_encap_proto(skb, vhdr); ++ skb->mac_header += VLAN_HLEN; ++ ++ if (skb_network_offset(skb) < ETH_HLEN) ++ skb_set_network_header(skb, ETH_HLEN); ++ ++ skb_reset_mac_len(skb); ++ ++ return err; ++} ++EXPORT_SYMBOL(__skb_vlan_pop); ++ ++/* Pop a vlan tag either from hwaccel or from payload. ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_pop(struct sk_buff *skb) ++{ ++ u16 vlan_tci; ++ __be16 vlan_proto; ++ int err; ++ ++ if (likely(skb_vlan_tag_present(skb))) { ++ __vlan_hwaccel_clear_tag(skb); ++ } else { ++ if (unlikely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (err) ++ return err; ++ } ++ /* move next vlan tag to hw accel tag */ ++ if (likely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ vlan_proto = skb->protocol; ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (unlikely(err)) ++ return err; ++ ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_pop); ++ ++/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) ++{ ++ if (skb_vlan_tag_present(skb)) { ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = __vlan_insert_tag(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (err) ++ return err; ++ ++ skb->protocol = skb->vlan_proto; ++ skb->mac_len += VLAN_HLEN; ++ ++ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ } ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_push); ++ ++/** ++ * skb_eth_pop() - Drop the Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * ++ * Drop the Ethernet header of @skb. ++ * ++ * Expects that skb->data points to the mac header and that no VLAN tags are ++ * present. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_eth_pop(struct sk_buff *skb) ++{ ++ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || ++ skb_network_offset(skb) < ETH_HLEN) ++ return -EPROTO; ++ ++ skb_pull_rcsum(skb, ETH_HLEN); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_pop); ++ ++/** ++ * skb_eth_push() - Add a new Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * @dst: Destination MAC address of the new header ++ * @src: Source MAC address of the new header ++ * ++ * Prepend @skb with a new Ethernet header. ++ * ++ * Expects that skb->data points to the mac header, which must be empty. ++ * ++ * Returns 0 on success, -errno otherwise. 
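__skb_vlan_pop() above removes the tag by sliding the two MAC addresses over it and then pulling VLAN_HLEN bytes; the sketch below does only that memmove on a flat buffer (illustrative, invented frame), leaving out the rcsum and header-offset bookkeeping the kernel also performs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN  6
#define VLAN_HLEN 4

int main(void)
{
    /* dst(6) src(6), 802.1Q tag, inner ethertype 0x0800, first payload byte */
    uint8_t frame[] = {
        0x02, 0, 0, 0, 0, 1,  0x02, 0, 0, 0, 0, 2,
        0x81, 0x00, 0x00, 0x64,
        0x08, 0x00,
        0x45
    };

    /* Same trick as __skb_vlan_pop(): move the addresses VLAN_HLEN bytes towards
     * the payload, then treat frame + VLAN_HLEN as the new start of the frame. */
    memmove(frame + VLAN_HLEN, frame, 2 * ETH_ALEN);
    uint8_t *untagged = frame + VLAN_HLEN;

    printf("new ethertype: 0x%02x%02x\n", untagged[12], untagged[13]); /* 0x0800 */
    return 0;
}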
++ */ ++int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, ++ const unsigned char *src) ++{ ++ struct ethhdr *eth; ++ int err; ++ ++ if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) ++ return -EPROTO; ++ ++ err = skb_cow_head(skb, sizeof(*eth)); ++ if (err < 0) ++ return err; ++ ++ skb_push(skb, sizeof(*eth)); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ eth = eth_hdr(skb); ++ ether_addr_copy(eth->h_dest, dst); ++ ether_addr_copy(eth->h_source, src); ++ eth->h_proto = skb->protocol; ++ ++ skb_postpush_rcsum(skb, eth, sizeof(*eth)); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_push); ++ ++/* Update the ethertype of hdr and the skb csum value if required. */ ++static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, ++ __be16 ethertype) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be16 diff[] = { ~hdr->h_proto, ethertype }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ hdr->h_proto = ethertype; ++} ++ ++/** ++ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of ++ * the packet ++ * ++ * @skb: buffer ++ * @mpls_lse: MPLS label stack entry to push ++ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is ++ * ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, ++ int mac_len, bool ethernet) ++{ ++ struct mpls_shim_hdr *lse; ++ int err; ++ ++ if (unlikely(!eth_p_mpls(mpls_proto))) ++ return -EINVAL; ++ ++ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ ++ if (skb->encapsulation) ++ return -EINVAL; ++ ++ err = skb_cow_head(skb, MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (!skb->inner_protocol) { ++ skb_set_inner_network_header(skb, skb_network_offset(skb)); ++ skb_set_inner_protocol(skb, skb->protocol); ++ } ++ ++ skb_push(skb, MPLS_HLEN); ++ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ skb_reset_mac_len(skb); ++ ++ lse = mpls_hdr(skb); ++ lse->label_stack_entry = mpls_lse; ++ skb_postpush_rcsum(skb, lse, MPLS_HLEN); ++ ++ if (ethernet && mac_len >= ETH_HLEN) ++ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); ++ skb->protocol = mpls_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_push); ++ ++/** ++ * skb_mpls_pop() - pop the outermost MPLS header ++ * ++ * @skb: buffer ++ * @next_proto: ethertype of header after popped MPLS header ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the packet is ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return 0; ++ ++ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); ++ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ ++ __skb_pull(skb, MPLS_HLEN); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ ++ if (ethernet && mac_len >= ETH_HLEN) { ++ struct ethhdr *hdr; ++ ++ /* use mpls_hdr() to get ethertype to account for VLANs. 
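skb_mpls_push() above writes a single label stack entry right after the MAC header; the sketch below only shows how such an LSE is packed bit-wise (label 20 bits, TC 3, bottom-of-stack 1, TTL 8). The values are arbitrary, and on the wire the kernel stores the word big-endian.

#include <stdint.h>
#include <stdio.h>

/* MPLS label stack entry: label(20) | TC(3) | S(1) | TTL(8) */
static uint32_t mpls_lse(uint32_t label, uint32_t tc, uint32_t bos, uint32_t ttl)
{
    return (label << 12) | (tc << 9) | (bos << 8) | ttl;
}

int main(void)
{
    uint32_t lse = mpls_lse(16, 0, 1, 64);  /* label 16, bottom of stack, TTL 64 */

    printf("lse = 0x%08x (label %u, tc %u, s %u, ttl %u)\n",
           lse, lse >> 12, (lse >> 9) & 7, (lse >> 8) & 1, lse & 0xff);
    return 0;
}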
*/ ++ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); ++ skb_mod_eth_type(skb, hdr, next_proto); ++ } ++ skb->protocol = next_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_pop); ++ ++/** ++ * skb_mpls_update_lse() - modify outermost MPLS header and update csum ++ * ++ * @skb: buffer ++ * @mpls_lse: new MPLS label stack entry to update to ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ mpls_hdr(skb)->label_stack_entry = mpls_lse; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_update_lse); ++ ++/** ++ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header ++ * ++ * @skb: buffer ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_dec_ttl(struct sk_buff *skb) ++{ ++ u32 lse; ++ u8 ttl; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) ++ return -ENOMEM; ++ ++ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ++ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; ++ if (!--ttl) ++ return -EINVAL; ++ ++ lse &= ~MPLS_LS_TTL_MASK; ++ lse |= ttl << MPLS_LS_TTL_SHIFT; ++ ++ return skb_mpls_update_lse(skb, cpu_to_be32(lse)); ++} ++EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); ++ ++/** ++ * alloc_skb_with_frags - allocate skb with page frags ++ * ++ * @header_len: size of linear part ++ * @data_len: needed length in frags ++ * @max_page_order: max page order desired. ++ * @errcode: pointer to error code if any ++ * @gfp_mask: allocation mask ++ * ++ * This can be used to allocate a paged skb, given a maximal order for frags. ++ */ ++struct sk_buff *alloc_skb_with_frags(unsigned long header_len, ++ unsigned long data_len, ++ int max_page_order, ++ int *errcode, ++ gfp_t gfp_mask) ++{ ++ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; ++ unsigned long chunk; ++ struct sk_buff *skb; ++ struct page *page; ++ int i; ++ ++ *errcode = -EMSGSIZE; ++ /* Note this test could be relaxed, if we succeed to allocate ++ * high order pages... 
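skb_mpls_dec_ttl() above is plain mask-and-shift arithmetic on that LSE; a standalone version of the same logic follows (the mask and shift are restated here to match the layout used in the sketch above, and the kernel's -EINVAL path becomes a flag).

#include <stdint.h>
#include <stdio.h>

#define TTL_MASK  0x000000ffu   /* TTL is the low byte of the LSE */
#define TTL_SHIFT 0

/* Returns the updated LSE; sets *expired when the TTL reaches zero. */
static uint32_t mpls_dec_ttl(uint32_t lse, int *expired)
{
    uint8_t ttl = (lse & TTL_MASK) >> TTL_SHIFT;

    if (!--ttl) {
        *expired = 1;           /* the kernel returns -EINVAL here */
        return lse;
    }
    *expired = 0;
    lse &= ~TTL_MASK;
    lse |= (uint32_t)ttl << TTL_SHIFT;
    return lse;
}

int main(void)
{
    int expired;
    uint32_t lse = mpls_dec_ttl(0x00010140, &expired);  /* label 16, TTL 64 */

    printf("new lse 0x%08x, ttl %u, expired %d\n", lse, lse & TTL_MASK, expired);
    return 0;
}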
++ */ ++ if (npages > MAX_SKB_FRAGS) ++ return NULL; ++ ++ *errcode = -ENOBUFS; ++ skb = alloc_skb(header_len, gfp_mask); ++ if (!skb) ++ return NULL; ++ ++ skb->truesize += npages << PAGE_SHIFT; ++ ++ for (i = 0; npages > 0; i++) { ++ int order = max_page_order; ++ ++ while (order) { ++ if (npages >= 1 << order) { ++ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | ++ __GFP_COMP | ++ __GFP_NOWARN, ++ order); ++ if (page) ++ goto fill_page; ++ /* Do not retry other high order allocations */ ++ order = 1; ++ max_page_order = 0; ++ } ++ order--; ++ } ++ page = alloc_page(gfp_mask); ++ if (!page) ++ goto failure; ++fill_page: ++ chunk = min_t(unsigned long, data_len, ++ PAGE_SIZE << order); ++ skb_fill_page_desc(skb, i, page, 0, chunk); ++ data_len -= chunk; ++ npages -= 1 << order; ++ } ++ return skb; ++ ++failure: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_skb_with_frags); ++ ++/* carve out the first off bytes from skb when off < headlen */ ++static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, ++ const int headlen, gfp_t gfp_mask) ++{ ++ int i; ++ int size = skb_end_offset(skb); ++ int new_hlen = headlen - off; ++ u8 *data; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy real data, and all frags */ ++ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); ++ skb->len -= off; ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, ++ frags[skb_shinfo(skb)->nr_frags])); ++ if (skb_cloned(skb)) { ++ /* drop the old head gracefully */ ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ skb_release_data(skb); ++ } else { ++ /* we can reuse existing recount- all we did was ++ * relocate values ++ */ ++ skb_free_head(skb); ++ } ++ ++ skb->head = data; ++ skb->data = data; ++ skb->head_frag = 0; ++ skb_set_end_offset(skb, size); ++ skb_set_tail_pointer(skb, skb_headlen(skb)); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ return 0; ++} ++ ++static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); ++ ++/* carve out the first eat bytes from skb's frag_list. May recurse into ++ * pskb_carve() ++ */ ++static int pskb_carve_frag_list(struct sk_buff *skb, ++ struct skb_shared_info *shinfo, int eat, ++ gfp_t gfp_mask) ++{ ++ struct sk_buff *list = shinfo->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (!list) { ++ pr_err("Not enough bytes to eat. Want %d\n", eat); ++ return -EFAULT; ++ } ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ if (skb_shared(list)) { ++ clone = skb_clone(list, gfp_mask); ++ if (!clone) ++ return -ENOMEM; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without problems. */ ++ insp = list; ++ } ++ if (pskb_carve(list, eat, gfp_mask) < 0) { ++ kfree_skb(clone); ++ return -ENOMEM; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. 
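The loop in alloc_skb_with_frags() above prefers high-order page blocks and falls back to single pages; the allocation-free sketch below shows how the order is picked per fragment for a given data_len (it leaves out the "stop retrying high orders after a failure" behaviour, and the sizes are arbitrary).

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

int main(void)
{
    unsigned long data_len = 40000;                               /* bytes wanted in frags */
    unsigned int npages = (data_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
    int max_order = 3;                                            /* up to 8-page blocks */
    int frag = 0;

    while (npages > 0) {
        int order = max_order;

        /* only use a high-order block if it does not overshoot what is left */
        while (order && npages < (1u << order))
            order--;

        unsigned long chunk = data_len < (PAGE_SIZE << order)
                            ? data_len : (PAGE_SIZE << order);
        printf("frag %d: order %d, %lu bytes\n", frag++, order, chunk);
        data_len -= chunk;
        npages -= 1u << order;
    }
    return 0;
}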
*/ ++ while ((list = shinfo->frag_list) != insp) { ++ shinfo->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ shinfo->frag_list = clone; ++ } ++ return 0; ++} ++ ++/* carve off first len bytes from skb. Split line (off) is in the ++ * non-linear part of skb ++ */ ++static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, ++ int pos, gfp_t gfp_mask) ++{ ++ int i, k = 0; ++ int size = skb_end_offset(skb); ++ u8 *data; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ struct skb_shared_info *shinfo; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ shinfo = (struct skb_shared_info *)(data + size); ++ for (i = 0; i < nfrags; i++) { ++ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + fsize > off) { ++ shinfo->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < off) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_off_add(&shinfo->frags[0], off - pos); ++ skb_frag_size_sub(&shinfo->frags[0], off - pos); ++ } ++ skb_frag_ref(skb, i); ++ k++; ++ } ++ pos += fsize; ++ } ++ shinfo->nr_frags = k; ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ /* split line is in frag list */ ++ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { ++ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ ++ if (skb_has_frag_list(skb)) ++ kfree_skb_list(skb_shinfo(skb)->frag_list); ++ kfree(data); ++ return -ENOMEM; ++ } ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data = data; ++ skb_set_end_offset(skb, size); ++ skb_reset_tail_pointer(skb); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ skb->len -= off; ++ skb->data_len = skb->len; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ return 0; ++} ++ ++/* remove len bytes from the beginning of the skb */ ++static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) ++{ ++ int headlen = skb_headlen(skb); ++ ++ if (len < headlen) ++ return pskb_carve_inside_header(skb, len, headlen, gfp); ++ else ++ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); ++} ++ ++/* Extract to_copy bytes starting at off from skb, and return this in ++ * a new skb ++ */ ++struct sk_buff *pskb_extract(struct sk_buff *skb, int off, ++ int to_copy, gfp_t gfp) ++{ ++ struct sk_buff *clone = skb_clone(skb, gfp); ++ ++ if (!clone) ++ return NULL; ++ ++ if (pskb_carve(clone, off, gfp) < 0 || ++ pskb_trim(clone, to_copy)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ return clone; ++} ++EXPORT_SYMBOL(pskb_extract); ++ ++/** ++ * skb_condense - try to get rid of fragments/frag_list if possible ++ * @skb: buffer ++ * ++ * Can be used to save memory before skb is added to a busy queue. 
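pskb_carve() above has to decide whether the requested split point sits in the linear header or inside one of the fragments, trimming the first fragment that survives; the walk below reproduces that decision with invented sizes (illustrative only).

#include <stdio.h>

int main(void)
{
    unsigned int headlen = 64;                     /* linear part of the skb */
    unsigned int frags[] = { 1000, 2000, 500 };    /* fragment sizes */
    unsigned int off = 1500;                       /* bytes to carve off the front */
    unsigned int pos = headlen;

    if (off < headlen) {
        printf("split inside the linear header, keep %u linear bytes\n", headlen - off);
        return 0;
    }

    for (unsigned int i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
        if (pos + frags[i] > off) {
            /* same amounts as the skb_frag_off_add()/skb_frag_size_sub() pair above */
            printf("split in frag %u: drop %u bytes, keep %u\n",
                   i, off - pos, frags[i] - (off - pos));
            return 0;
        }
        pos += frags[i];
    }
    printf("split falls into the frag_list\n");
    return 0;
}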
++ * If packet has bytes in frags and enough tail room in skb->head, ++ * pull all of them, so that we can free the frags right now and adjust ++ * truesize. ++ * Notes: ++ * We do not reallocate skb->head thus can not fail. ++ * Caller must re-evaluate skb->truesize if needed. ++ */ ++void skb_condense(struct sk_buff *skb) ++{ ++ if (skb->data_len) { ++ if (skb->data_len > skb->end - skb->tail || ++ skb_cloned(skb)) ++ return; ++ ++ /* Nice, we can free page frag(s) right now */ ++ __pskb_pull_tail(skb, skb->data_len); ++ } ++ /* At this point, skb->truesize might be over estimated, ++ * because skb had a fragment, and fragments do not tell ++ * their truesize. ++ * When we pulled its content into skb->head, fragment ++ * was freed, but __pskb_pull_tail() could not possibly ++ * adjust skb->truesize, not knowing the frag truesize. ++ */ ++ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); ++} ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) ++{ ++ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); ++} ++ ++/** ++ * __skb_ext_alloc - allocate a new skb extensions storage ++ * ++ * @flags: See kmalloc(). ++ * ++ * Returns the newly allocated pointer. The pointer can later attached to a ++ * skb via __skb_ext_set(). ++ * Note: caller must handle the skb_ext as an opaque data. ++ */ ++struct skb_ext *__skb_ext_alloc(gfp_t flags) ++{ ++ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); ++ ++ if (new) { ++ memset(new->offset, 0, sizeof(new->offset)); ++ refcount_set(&new->refcnt, 1); ++ } ++ ++ return new; ++} ++ ++static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, ++ unsigned int old_active) ++{ ++ struct skb_ext *new; ++ ++ if (refcount_read(&old->refcnt) == 1) ++ return old; ++ ++ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ ++ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); ++ refcount_set(&new->refcnt, 1); ++ ++#ifdef CONFIG_XFRM ++ if (old_active & (1 << SKB_EXT_SEC_PATH)) { ++ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_hold(sp->xvec[i]); ++ } ++#endif ++ __skb_ext_put(old); ++ return new; ++} ++ ++/** ++ * __skb_ext_set - attach the specified extension storage to this skb ++ * @skb: buffer ++ * @id: extension id ++ * @ext: extension storage previously allocated via __skb_ext_alloc() ++ * ++ * Existing extensions, if any, are cleared. ++ * ++ * Returns the pointer to the extension. ++ */ ++void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, ++ struct skb_ext *ext) ++{ ++ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); ++ ++ skb_ext_put(skb); ++ newlen = newoff + skb_ext_type_len[id]; ++ ext->chunks = newlen; ++ ext->offset[id] = newoff; ++ skb->extensions = ext; ++ skb->active_extensions = 1 << id; ++ return skb_ext_get_ptr(ext, id); ++} ++ ++/** ++ * skb_ext_add - allocate space for given extension, COW if needed ++ * @skb: buffer ++ * @id: extension to allocate space for ++ * ++ * Allocates enough space for the given extension. ++ * If the extension is already present, a pointer to that extension ++ * is returned. ++ * ++ * If the skb was cloned, COW applies and the returned memory can be ++ * modified without changing the extension space of clones buffers. ++ * ++ * Returns pointer to the extension or NULL on allocation failure. 
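The extension code above keeps everything in one allocation with a per-id offset table measured in SKB_EXT_ALIGN_VALUE units; the toy layout below (names, sizes and the 8-byte alignment are invented for the demo) shows the same offset arithmetic that skb_ext_get_ptr() and __skb_ext_set() rely on.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EXT_ALIGN 8       /* stand-in for SKB_EXT_ALIGN_VALUE */
#define EXT_IDS   4

struct ext_hdr {
    uint8_t offset[EXT_IDS];   /* per-id offset, in EXT_ALIGN units */
    uint8_t chunks;            /* total size, in EXT_ALIGN units */
};

static void *ext_add(struct ext_hdr *h, unsigned int id, unsigned int len)
{
    unsigned int off = h->chunks;

    h->offset[id] = (uint8_t)off;
    h->chunks = (uint8_t)(off + (len + EXT_ALIGN - 1) / EXT_ALIGN);
    return (uint8_t *)h + off * EXT_ALIGN;        /* same math as skb_ext_get_ptr() */
}

static void *ext_get(struct ext_hdr *h, unsigned int id)
{
    return (uint8_t *)h + h->offset[id] * EXT_ALIGN;
}

int main(void)
{
    uint8_t storage[256] = { 0 };
    struct ext_hdr *h = (struct ext_hdr *)storage;

    h->chunks = (sizeof(*h) + EXT_ALIGN - 1) / EXT_ALIGN;  /* header takes the first chunk */
    strcpy(ext_add(h, 1, 16), "ext one");
    strcpy(ext_add(h, 2, 24), "ext two");
    printf("%s / %s\n", (char *)ext_get(h, 1), (char *)ext_get(h, 2));
    return 0;
}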
++ */ ++void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *new, *old = NULL; ++ unsigned int newlen, newoff; ++ ++ if (skb->active_extensions) { ++ old = skb->extensions; ++ ++ new = skb_ext_maybe_cow(old, skb->active_extensions); ++ if (!new) ++ return NULL; ++ ++ if (__skb_ext_exist(new, id)) ++ goto set_active; ++ ++ newoff = new->chunks; ++ } else { ++ newoff = SKB_EXT_CHUNKSIZEOF(*new); ++ ++ new = __skb_ext_alloc(GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ } ++ ++ newlen = newoff + skb_ext_type_len[id]; ++ new->chunks = newlen; ++ new->offset[id] = newoff; ++set_active: ++ skb->slow_gro = 1; ++ skb->extensions = new; ++ skb->active_extensions |= 1 << id; ++ return skb_ext_get_ptr(new, id); ++} ++EXPORT_SYMBOL(skb_ext_add); ++ ++#ifdef CONFIG_XFRM ++static void skb_ext_put_sp(struct sec_path *sp) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_put(sp->xvec[i]); ++} ++#endif ++ ++#ifdef CONFIG_MCTP_FLOWS ++static void skb_ext_put_mctp(struct mctp_flow *flow) ++{ ++ if (flow->key) ++ mctp_key_unref(flow->key); ++} ++#endif ++ ++void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *ext = skb->extensions; ++ ++ skb->active_extensions &= ~(1 << id); ++ if (skb->active_extensions == 0) { ++ skb->extensions = NULL; ++ __skb_ext_put(ext); ++#ifdef CONFIG_XFRM ++ } else if (id == SKB_EXT_SEC_PATH && ++ refcount_read(&ext->refcnt) == 1) { ++ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); ++ ++ skb_ext_put_sp(sp); ++ sp->len = 0; ++#endif ++ } ++} ++EXPORT_SYMBOL(__skb_ext_del); ++ ++void __skb_ext_put(struct skb_ext *ext) ++{ ++ /* If this is last clone, nothing can increment ++ * it after check passes. Avoids one atomic op. ++ */ ++ if (refcount_read(&ext->refcnt) == 1) ++ goto free_now; ++ ++ if (!refcount_dec_and_test(&ext->refcnt)) ++ return; ++free_now: ++#ifdef CONFIG_XFRM ++ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) ++ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); ++#endif ++#ifdef CONFIG_MCTP_FLOWS ++ if (__skb_ext_exist(ext, SKB_EXT_MCTP)) ++ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); ++#endif ++ ++ kmem_cache_free(skbuff_ext_cache, ext); ++} ++EXPORT_SYMBOL(__skb_ext_put); ++#endif /* CONFIG_SKB_EXTENSIONS */ ++ ++/** ++ * skb_attempt_defer_free - queue skb for remote freeing ++ * @skb: buffer ++ * ++ * Put @skb in a per-cpu list, using the cpu which ++ * allocated the skb/pages to reduce false sharing ++ * and memory zone spinlock contention. ++ */ ++void skb_attempt_defer_free(struct sk_buff *skb) ++{ ++ int cpu = skb->alloc_cpu; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int defer_max; ++ bool kick; ++ ++ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || ++ !cpu_online(cpu) || ++ cpu == raw_smp_processor_id()) { ++nodefer: __kfree_skb(skb); ++ return; ++ } ++ ++ sd = &per_cpu(softnet_data, cpu); ++ defer_max = READ_ONCE(sysctl_skb_defer_max); ++ if (READ_ONCE(sd->defer_count) >= defer_max) ++ goto nodefer; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ /* Send an IPI every time queue reaches half capacity. 
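__skb_ext_put() above (and skb_ext_maybe_cow() before it) read the refcount first and treat a value of 1 as "we are the last holder", skipping the atomic decrement; a minimal C11 sketch of that release pattern follows (the kernel's refcount_t adds saturation and ordering guarantees this toy version does not).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true when the caller should free the object now. */
static bool ref_put(atomic_uint *refcnt)
{
    /* Last holder: nothing else can raise the count once this test passes,
     * so the atomic decrement can be skipped entirely. */
    if (atomic_load(refcnt) == 1)
        return true;

    return atomic_fetch_sub(refcnt, 1) == 1;
}

int main(void)
{
    atomic_uint r = 2;

    printf("first put frees:  %d\n", ref_put(&r));   /* 0: another holder remains */
    printf("second put frees: %d\n", ref_put(&r));   /* 1: fast path, free now */
    return 0;
}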
*/ ++ kick = sd->defer_count == (defer_max >> 1); ++ /* Paired with the READ_ONCE() few lines above */ ++ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); ++ ++ skb->next = sd->defer_list; ++ /* Paired with READ_ONCE() in skb_defer_free_flush() */ ++ WRITE_ONCE(sd->defer_list, skb); ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU ++ * if we are unlucky enough (this seems very unlikely). ++ */ ++ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) ++ smp_call_function_single_async(cpu, &sd->defer_csd); ++} +diff -rupN linux.orig/net/dsa/slave.c linux/net/dsa/slave.c +--- linux.orig/net/dsa/slave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/dsa/slave.c 2022-12-04 10:40:26.732034003 -0500 +@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats( s = per_cpu_ptr(dev->tstats, i); do { @@ -8833,11 +57287,10 @@ index 1291c2431d440..dcc550b871623 100644 data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; -diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index 3ca0cc4678862..dbae0c79d5cfb 100644 ---- a/net/ipv4/af_inet.c -+++ b/net/ipv4/af_inet.c -@@ -1684,9 +1684,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, +diff -rupN linux.orig/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c +--- linux.orig/net/ipv4/af_inet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv4/af_inet.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1686,9 +1686,9 @@ u64 snmp_get_cpu_field64(void __percpu * bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { @@ -8849,11 +57302,2095 @@ index 3ca0cc4678862..dbae0c79d5cfb 100644 return v; } -diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c -index b7de5e46fdd8f..f84da849819cc 100644 ---- a/net/ipv6/seg6_local.c -+++ b/net/ipv6/seg6_local.c -@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) +diff -rupN linux.orig/net/ipv4/af_inet.c.orig linux/net/ipv4/af_inet.c.orig +--- linux.orig/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/ipv4/af_inet.c.orig 2022-12-04 10:40:18.732054506 -0500 +@@ -0,0 +1,2081 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * PF_INET protocol family socket handler. ++ * ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Florian La Roche, ++ * Alan Cox, ++ * ++ * Changes (see also sock.c) ++ * ++ * piggy, ++ * Karl Knutson : Socket protocol table ++ * A.N.Kuznetsov : Socket death error in accept(). ++ * John Richardson : Fix non blocking error in connect() ++ * so sockets that fail to connect ++ * don't return -EINPROGRESS. ++ * Alan Cox : Asynchronous I/O support ++ * Alan Cox : Keep correct socket pointer on sock ++ * structures ++ * when accept() ed ++ * Alan Cox : Semantics of SO_LINGER aren't state ++ * moved to close when you look carefully. ++ * With this fixed and the accept bug fixed ++ * some RPC stuff seems happier. ++ * Niibe Yutaka : 4.4BSD style write async I/O ++ * Alan Cox, ++ * Tony Gale : Fixed reuse semantics. ++ * Alan Cox : bind() shouldn't abort existing but dead ++ * sockets. Stops FTP netin:.. I hope. ++ * Alan Cox : bind() works correctly for RAW sockets. ++ * Note that FreeBSD at least was broken ++ * in this respect so be careful with ++ * compatibility tests... 
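The net/dsa and net/ipv4 hunks above only switch the statistics readers from the u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() pair to the plain begin/retry variants; the retry loop itself is the usual sequence-counter read, sketched here with C11 atomics (single writer, simplified memory ordering, counter names invented).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: writer in progress */
static uint64_t tx_packets, tx_bytes;   /* the per-cpu counters being snapshotted */

static unsigned int stats_read_begin(void)
{
    unsigned int s;

    while ((s = atomic_load(&seq)) & 1)
        ;                               /* writer active, wait for an even value */
    return s;
}

static int stats_read_retry(unsigned int s)
{
    return atomic_load(&seq) != s;      /* changed underneath us: re-read */
}

int main(void)
{
    uint64_t pkts, bytes;
    unsigned int s;

    /* writer side (another CPU in the kernel): bump to odd, update, bump to even */
    atomic_fetch_add(&seq, 1);
    tx_packets += 1;
    tx_bytes += 1500;
    atomic_fetch_add(&seq, 1);

    do {
        s = stats_read_begin();
        pkts = tx_packets;
        bytes = tx_bytes;
    } while (stats_read_retry(s));

    printf("%llu packets, %llu bytes\n",
           (unsigned long long)pkts, (unsigned long long)bytes);
    return 0;
}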
++ * Alan Cox : routing cache support ++ * Alan Cox : memzero the socket structure for ++ * compactness. ++ * Matt Day : nonblock connect error handler ++ * Alan Cox : Allow large numbers of pending sockets ++ * (eg for big web sites), but only if ++ * specifically application requested. ++ * Alan Cox : New buffering throughout IP. Used ++ * dumbly. ++ * Alan Cox : New buffering now used smartly. ++ * Alan Cox : BSD rather than common sense ++ * interpretation of listen. ++ * Germano Caronni : Assorted small races. ++ * Alan Cox : sendmsg/recvmsg basic support. ++ * Alan Cox : Only sendmsg/recvmsg now supported. ++ * Alan Cox : Locked down bind (see security list). ++ * Alan Cox : Loosened bind a little. ++ * Mike McLagan : ADD/DEL DLCI Ioctls ++ * Willy Konynenberg : Transparent proxying support. ++ * David S. Miller : New socket lookup architecture. ++ * Some other random speedups. ++ * Cyrus Durgin : Cleaned up file for kmod hacks. ++ * Andi Kleen : Fix inet_stream_connect TCP race. ++ */ ++ ++#define pr_fmt(fmt) "IPv4: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_IP_MROUTE ++#include ++#endif ++#include ++#include ++ ++#include ++ ++/* The inetsw table contains everything that inet_create needs to ++ * build a new socket. ++ */ ++static struct list_head inetsw[SOCK_MAX]; ++static DEFINE_SPINLOCK(inetsw_lock); ++ ++/* New destruction routine */ ++ ++void inet_sock_destruct(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ __skb_queue_purge(&sk->sk_receive_queue); ++ __skb_queue_purge(&sk->sk_error_queue); ++ ++ sk_mem_reclaim_final(sk); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { ++ pr_err("Attempt to release TCP socket in state %d %p\n", ++ sk->sk_state, sk); ++ return; ++ } ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ pr_err("Attempt to release alive inet socket %p\n", sk); ++ return; ++ } ++ ++ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); ++ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); ++ WARN_ON_ONCE(sk->sk_wmem_queued); ++ WARN_ON_ONCE(sk_forward_alloc_get(sk)); ++ ++ kfree(rcu_dereference_protected(inet->inet_opt, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); ++ sk_refcnt_debug_dec(sk); ++} ++EXPORT_SYMBOL(inet_sock_destruct); ++ ++/* ++ * The routines beyond this point handle the behaviour of an AF_INET ++ * socket object. Mostly it punts to the subprotocols of IP to do ++ * the work. ++ */ ++ ++/* ++ * Automatically bind an unbound socket. ++ */ ++ ++static int inet_autobind(struct sock *sk) ++{ ++ struct inet_sock *inet; ++ /* We may need to bind the socket. */ ++ lock_sock(sk); ++ inet = inet_sk(sk); ++ if (!inet->inet_num) { ++ if (sk->sk_prot->get_port(sk, 0)) { ++ release_sock(sk); ++ return -EAGAIN; ++ } ++ inet->inet_sport = htons(inet->inet_num); ++ } ++ release_sock(sk); ++ return 0; ++} ++ ++/* ++ * Move a socket into listening state. 
++ */ ++int inet_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ unsigned char old_state; ++ int err, tcp_fastopen; ++ ++ lock_sock(sk); ++ ++ err = -EINVAL; ++ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) ++ goto out; ++ ++ old_state = sk->sk_state; ++ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) ++ goto out; ++ ++ WRITE_ONCE(sk->sk_max_ack_backlog, backlog); ++ /* Really, if the socket is already in listen state ++ * we can only allow the backlog to be adjusted. ++ */ ++ if (old_state != TCP_LISTEN) { ++ /* Enable TFO w/o requiring TCP_FASTOPEN socket option. ++ * Note that only TCP sockets (SOCK_STREAM) will reach here. ++ * Also fastopen backlog may already been set via the option ++ * because the socket was in TCP_LISTEN state previously but ++ * was shutdown() rather than close(). ++ */ ++ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); ++ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && ++ (tcp_fastopen & TFO_SERVER_ENABLE) && ++ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { ++ fastopen_queue_tune(sk, backlog); ++ tcp_fastopen_init_key_once(sock_net(sk)); ++ } ++ ++ err = inet_csk_listen_start(sk); ++ if (err) ++ goto out; ++ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++ } ++ err = 0; ++ ++out: ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_listen); ++ ++/* ++ * Create an inet socket. ++ */ ++ ++static int inet_create(struct net *net, struct socket *sock, int protocol, ++ int kern) ++{ ++ struct sock *sk; ++ struct inet_protosw *answer; ++ struct inet_sock *inet; ++ struct proto *answer_prot; ++ unsigned char answer_flags; ++ int try_loading_module = 0; ++ int err; ++ ++ if (protocol < 0 || protocol >= IPPROTO_MAX) ++ return -EINVAL; ++ ++ sock->state = SS_UNCONNECTED; ++ ++ /* Look for the requested type/protocol pair. */ ++lookup_protocol: ++ err = -ESOCKTNOSUPPORT; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { ++ ++ err = 0; ++ /* Check the non-wild match. */ ++ if (protocol == answer->protocol) { ++ if (protocol != IPPROTO_IP) ++ break; ++ } else { ++ /* Check for the two wild cases. */ ++ if (IPPROTO_IP == protocol) { ++ protocol = answer->protocol; ++ break; ++ } ++ if (IPPROTO_IP == answer->protocol) ++ break; ++ } ++ err = -EPROTONOSUPPORT; ++ } ++ ++ if (unlikely(err)) { ++ if (try_loading_module < 2) { ++ rcu_read_unlock(); ++ /* ++ * Be more specific, e.g. net-pf-2-proto-132-type-1 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) ++ */ ++ if (++try_loading_module == 1) ++ request_module("net-pf-%d-proto-%d-type-%d", ++ PF_INET, protocol, sock->type); ++ /* ++ * Fall back to generic, e.g. 
net-pf-2-proto-132 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP) ++ */ ++ else ++ request_module("net-pf-%d-proto-%d", ++ PF_INET, protocol); ++ goto lookup_protocol; ++ } else ++ goto out_rcu_unlock; ++ } ++ ++ err = -EPERM; ++ if (sock->type == SOCK_RAW && !kern && ++ !ns_capable(net->user_ns, CAP_NET_RAW)) ++ goto out_rcu_unlock; ++ ++ sock->ops = answer->ops; ++ answer_prot = answer->prot; ++ answer_flags = answer->flags; ++ rcu_read_unlock(); ++ ++ WARN_ON(!answer_prot->slab); ++ ++ err = -ENOMEM; ++ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ++ if (!sk) ++ goto out; ++ ++ err = 0; ++ if (INET_PROTOSW_REUSE & answer_flags) ++ sk->sk_reuse = SK_CAN_REUSE; ++ ++ inet = inet_sk(sk); ++ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; ++ ++ inet->nodefrag = 0; ++ ++ if (SOCK_RAW == sock->type) { ++ inet->inet_num = protocol; ++ if (IPPROTO_RAW == protocol) ++ inet->hdrincl = 1; ++ } ++ ++ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) ++ inet->pmtudisc = IP_PMTUDISC_DONT; ++ else ++ inet->pmtudisc = IP_PMTUDISC_WANT; ++ ++ inet->inet_id = 0; ++ ++ sock_init_data(sock, sk); ++ ++ sk->sk_destruct = inet_sock_destruct; ++ sk->sk_protocol = protocol; ++ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ++ ++ inet->uc_ttl = -1; ++ inet->mc_loop = 1; ++ inet->mc_ttl = 1; ++ inet->mc_all = 1; ++ inet->mc_index = 0; ++ inet->mc_list = NULL; ++ inet->rcv_tos = 0; ++ ++ sk_refcnt_debug_inc(sk); ++ ++ if (inet->inet_num) { ++ /* It assumes that any protocol which allows ++ * the user to assign a number at socket ++ * creation time automatically ++ * shares. ++ */ ++ inet->inet_sport = htons(inet->inet_num); ++ /* Add to protocol hash chains. */ ++ err = sk->sk_prot->hash(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (sk->sk_prot->init) { ++ err = sk->sk_prot->init(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (!kern) { ++ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++out: ++ return err; ++out_rcu_unlock: ++ rcu_read_unlock(); ++ goto out; ++} ++ ++ ++/* ++ * The peer socket should always be NULL (or else). When we call this ++ * function we are destroying the object and from then on nobody ++ * should refer to it. ++ */ ++int inet_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ long timeout; ++ ++ if (!sk->sk_kern_sock) ++ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); ++ ++ /* Applications forget to leave groups before exiting */ ++ ip_mc_drop_socket(sk); ++ ++ /* If linger is set, we don't return until the close ++ * is complete. Otherwise we return immediately. The ++ * actually closing is done the same either way. ++ * ++ * If the close is due to the process exiting, we never ++ * linger.. ++ */ ++ timeout = 0; ++ if (sock_flag(sk, SOCK_LINGER) && ++ !(current->flags & PF_EXITING)) ++ timeout = sk->sk_lingertime; ++ sk->sk_prot->close(sk, timeout); ++ sock->sk = NULL; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(inet_release); ++ ++int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sock *sk = sock->sk; ++ u32 flags = BIND_WITH_LOCK; ++ int err; ++ ++ /* If the socket has its own bind function then use it. (RAW) */ ++ if (sk->sk_prot->bind) { ++ return sk->sk_prot->bind(sk, uaddr, addr_len); ++ } ++ if (addr_len < sizeof(struct sockaddr_in)) ++ return -EINVAL; ++ ++ /* BPF prog is run before any checks are done so that if the prog ++ * changes context in a wrong way it will be caught. 
++ */ ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ CGROUP_INET4_BIND, &flags); ++ if (err) ++ return err; ++ ++ return __inet_bind(sk, uaddr, addr_len, flags); ++} ++EXPORT_SYMBOL(inet_bind); ++ ++int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, ++ u32 flags) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; ++ struct inet_sock *inet = inet_sk(sk); ++ struct net *net = sock_net(sk); ++ unsigned short snum; ++ int chk_addr_ret; ++ u32 tb_id = RT_TABLE_LOCAL; ++ int err; ++ ++ if (addr->sin_family != AF_INET) { ++ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) ++ * only if s_addr is INADDR_ANY. ++ */ ++ err = -EAFNOSUPPORT; ++ if (addr->sin_family != AF_UNSPEC || ++ addr->sin_addr.s_addr != htonl(INADDR_ANY)) ++ goto out; ++ } ++ ++ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; ++ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ++ ++ /* Not specified by any standard per-se, however it breaks too ++ * many applications when removed. It is unfortunate since ++ * allowing applications to make a non-local bind solves ++ * several problems with systems using dynamic addressing. ++ * (ie. your servers still start up even if your ISDN link ++ * is temporarily down) ++ */ ++ err = -EADDRNOTAVAIL; ++ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, ++ chk_addr_ret)) ++ goto out; ++ ++ snum = ntohs(addr->sin_port); ++ err = -EACCES; ++ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && ++ snum && inet_port_requires_bind_service(net, snum) && ++ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) ++ goto out; ++ ++ /* We keep a pair of addresses. rcv_saddr is the one ++ * used by hash lookups, and saddr is used for transmit. ++ * ++ * In the BSD API these are the same except where it ++ * would be illegal to use them (multicast/broadcast) in ++ * which case the sending device address is used. ++ */ ++ if (flags & BIND_WITH_LOCK) ++ lock_sock(sk); ++ ++ /* Check these errors (active socket, double bind). */ ++ err = -EINVAL; ++ if (sk->sk_state != TCP_CLOSE || inet->inet_num) ++ goto out_release_sock; ++ ++ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) ++ inet->inet_saddr = 0; /* Use device */ ++ ++ /* Make sure we are allowed to bind here. 
*/ ++ if (snum || !(inet->bind_address_no_port || ++ (flags & BIND_FORCE_ADDRESS_NO_PORT))) { ++ if (sk->sk_prot->get_port(sk, snum)) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ err = -EADDRINUSE; ++ goto out_release_sock; ++ } ++ if (!(flags & BIND_FROM_BPF)) { ++ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); ++ if (err) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ if (sk->sk_prot->put_port) ++ sk->sk_prot->put_port(sk); ++ goto out_release_sock; ++ } ++ } ++ } ++ ++ if (inet->inet_rcv_saddr) ++ sk->sk_userlocks |= SOCK_BINDADDR_LOCK; ++ if (snum) ++ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; ++ inet->inet_sport = htons(inet->inet_num); ++ inet->inet_daddr = 0; ++ inet->inet_dport = 0; ++ sk_dst_reset(sk); ++ err = 0; ++out_release_sock: ++ if (flags & BIND_WITH_LOCK) ++ release_sock(sk); ++out: ++ return err; ++} ++ ++int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ if (uaddr->sa_family == AF_UNSPEC) ++ return sk->sk_prot->disconnect(sk, flags); ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ return err; ++ } ++ ++ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) ++ return -EAGAIN; ++ return sk->sk_prot->connect(sk, uaddr, addr_len); ++} ++EXPORT_SYMBOL(inet_dgram_connect); ++ ++static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) ++{ ++ DEFINE_WAIT_FUNC(wait, woken_wake_function); ++ ++ add_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending += writebias; ++ ++ /* Basic assumption: if someone sets sk->sk_err, he _must_ ++ * change state of the socket from TCP_SYN_*. ++ * Connect() does not allow to get error notifications ++ * without closing the socket. ++ */ ++ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ release_sock(sk); ++ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ++ lock_sock(sk); ++ if (signal_pending(current) || !timeo) ++ break; ++ } ++ remove_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending -= writebias; ++ return timeo; ++} ++ ++/* ++ * Connect to a remote host. There is regrettably still a little ++ * TCP 'magic' in here. ++ */ ++int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags, int is_sendmsg) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ long timeo; ++ ++ /* ++ * uaddr can be NULL and addr_len can be 0 if: ++ * sk is a TCP fastopen active socket and ++ * TCP_FASTOPEN_CONNECT sockopt is set and ++ * we already have a valid cookie for this socket. ++ * In this case, user can call write() after connect(). ++ * write() will invoke tcp_sendmsg_fastopen() which calls ++ * __inet_stream_connect(). ++ */ ++ if (uaddr) { ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ ++ if (uaddr->sa_family == AF_UNSPEC) { ++ err = sk->sk_prot->disconnect(sk, flags); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ goto out; ++ } ++ } ++ ++ switch (sock->state) { ++ default: ++ err = -EINVAL; ++ goto out; ++ case SS_CONNECTED: ++ err = -EISCONN; ++ goto out; ++ case SS_CONNECTING: ++ if (inet_sk(sk)->defer_connect) ++ err = is_sendmsg ? 
-EINPROGRESS : -EISCONN; ++ else ++ err = -EALREADY; ++ /* Fall out of switch with err, set for this state */ ++ break; ++ case SS_UNCONNECTED: ++ err = -EISCONN; ++ if (sk->sk_state != TCP_CLOSE) ++ goto out; ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ goto out; ++ } ++ ++ err = sk->sk_prot->connect(sk, uaddr, addr_len); ++ if (err < 0) ++ goto out; ++ ++ sock->state = SS_CONNECTING; ++ ++ if (!err && inet_sk(sk)->defer_connect) ++ goto out; ++ ++ /* Just entered SS_CONNECTING state; the only ++ * difference is that return value in non-blocking ++ * case is EINPROGRESS, rather than EALREADY. ++ */ ++ err = -EINPROGRESS; ++ break; ++ } ++ ++ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); ++ ++ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ int writebias = (sk->sk_protocol == IPPROTO_TCP) && ++ tcp_sk(sk)->fastopen_req && ++ tcp_sk(sk)->fastopen_req->data ? 1 : 0; ++ ++ /* Error code is set above */ ++ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) ++ goto out; ++ ++ err = sock_intr_errno(timeo); ++ if (signal_pending(current)) ++ goto out; ++ } ++ ++ /* Connection was closed by RST, timeout, ICMP error ++ * or another process disconnected us. ++ */ ++ if (sk->sk_state == TCP_CLOSE) ++ goto sock_error; ++ ++ /* sk->sk_err may be not zero now, if RECVERR was ordered by user ++ * and error was received after socket entered established state. ++ * Hence, it is handled normally after connect() return successfully. ++ */ ++ ++ sock->state = SS_CONNECTED; ++ err = 0; ++out: ++ return err; ++ ++sock_error: ++ err = sock_error(sk) ? : -ECONNABORTED; ++ sock->state = SS_UNCONNECTED; ++ if (sk->sk_prot->disconnect(sk, flags)) ++ sock->state = SS_DISCONNECTING; ++ goto out; ++} ++EXPORT_SYMBOL(__inet_stream_connect); ++ ++int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ int err; ++ ++ lock_sock(sock->sk); ++ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); ++ release_sock(sock->sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_stream_connect); ++ ++/* ++ * Accept a pending connection. The TCP layer now gives BSD semantics. ++ */ ++ ++int inet_accept(struct socket *sock, struct socket *newsock, int flags, ++ bool kern) ++{ ++ struct sock *sk1 = sock->sk; ++ int err = -EINVAL; ++ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); ++ ++ if (!sk2) ++ goto do_err; ++ ++ lock_sock(sk2); ++ ++ sock_rps_record_flow(sk2); ++ WARN_ON(!((1 << sk2->sk_state) & ++ (TCPF_ESTABLISHED | TCPF_SYN_RECV | ++ TCPF_CLOSE_WAIT | TCPF_CLOSE))); ++ ++ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) ++ set_bit(SOCK_SUPPORT_ZC, &newsock->flags); ++ sock_graft(sk2, newsock); ++ ++ newsock->state = SS_CONNECTED; ++ err = 0; ++ release_sock(sk2); ++do_err: ++ return err; ++} ++EXPORT_SYMBOL(inet_accept); ++ ++/* ++ * This does both peername and sockname. 
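__inet_stream_connect() above is the kernel half of the familiar non-blocking connect sequence: EINPROGRESS on the first call, EALREADY while the handshake is pending, EISCONN afterwards. For context, this is the matching caller-side pattern in userspace (127.0.0.1 port 9 is an arbitrary placeholder, so the printed result depends on the machine).

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(9) };

    if (fd < 0)
        return 1;
    inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
    fcntl(fd, F_SETFL, O_NONBLOCK);

    if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0) {
        puts("connected immediately");
    } else if (errno == EINPROGRESS) {
        /* the socket is now in SS_CONNECTING; wait for writability and read
         * the deferred result with SO_ERROR */
        struct pollfd pfd = { .fd = fd, .events = POLLOUT };
        int soerr = 0;
        socklen_t len = sizeof(soerr);

        poll(&pfd, 1, 1000);
        getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &len);
        printf("deferred result: %s\n", soerr ? strerror(soerr) : "connected");
    } else {
        printf("connect: %s\n", strerror(errno));
    }
    close(fd);
    return 0;
}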
++ */ ++int inet_getname(struct socket *sock, struct sockaddr *uaddr, ++ int peer) ++{ ++ struct sock *sk = sock->sk; ++ struct inet_sock *inet = inet_sk(sk); ++ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); ++ ++ sin->sin_family = AF_INET; ++ lock_sock(sk); ++ if (peer) { ++ if (!inet->inet_dport || ++ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && ++ peer == 1)) { ++ release_sock(sk); ++ return -ENOTCONN; ++ } ++ sin->sin_port = inet->inet_dport; ++ sin->sin_addr.s_addr = inet->inet_daddr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETPEERNAME); ++ } else { ++ __be32 addr = inet->inet_rcv_saddr; ++ if (!addr) ++ addr = inet->inet_saddr; ++ sin->sin_port = inet->inet_sport; ++ sin->sin_addr.s_addr = addr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETSOCKNAME); ++ } ++ release_sock(sk); ++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ++ return sizeof(*sin); ++} ++EXPORT_SYMBOL(inet_getname); ++ ++int inet_send_prepare(struct sock *sk) ++{ ++ sock_rps_record_flow(sk); ++ ++ /* We may need to bind the socket. */ ++ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && ++ inet_autobind(sk)) ++ return -EAGAIN; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(inet_send_prepare); ++ ++int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, ++ sk, msg, size); ++} ++EXPORT_SYMBOL(inet_sendmsg); ++ ++ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ if (sk->sk_prot->sendpage) ++ return sk->sk_prot->sendpage(sk, page, offset, size, flags); ++ return sock_no_sendpage(sock, page, offset, size, flags); ++} ++EXPORT_SYMBOL(inet_sendpage); ++ ++INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, ++ size_t, int, int *)); ++int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ++ int flags) ++{ ++ struct sock *sk = sock->sk; ++ int addr_len = 0; ++ int err; ++ ++ if (likely(!(flags & MSG_ERRQUEUE))) ++ sock_rps_record_flow(sk); ++ ++ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, ++ sk, msg, size, flags, &addr_len); ++ if (err >= 0) ++ msg->msg_namelen = addr_len; ++ return err; ++} ++EXPORT_SYMBOL(inet_recvmsg); ++ ++int inet_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ ++ /* This should really check to make sure ++ * the socket is a TCP socket. (WHY AC...) ++ */ ++ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and ++ 1->2 bit 2 snds. ++ 2->3 */ ++ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ ++ return -EINVAL; ++ ++ lock_sock(sk); ++ if (sock->state == SS_CONNECTING) { ++ if ((1 << sk->sk_state) & ++ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) ++ sock->state = SS_DISCONNECTING; ++ else ++ sock->state = SS_CONNECTED; ++ } ++ ++ switch (sk->sk_state) { ++ case TCP_CLOSE: ++ err = -ENOTCONN; ++ /* Hack to wake up other listeners, who can poll for ++ EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ ++ fallthrough; ++ default: ++ sk->sk_shutdown |= how; ++ if (sk->sk_prot->shutdown) ++ sk->sk_prot->shutdown(sk, how); ++ break; ++ ++ /* Remaining two branches are temporary solution for missing ++ * close() in multithreaded environment. 
It is _not_ a good idea, ++ * but we have no choice until close() is repaired at VFS level. ++ */ ++ case TCP_LISTEN: ++ if (!(how & RCV_SHUTDOWN)) ++ break; ++ fallthrough; ++ case TCP_SYN_SENT: ++ err = sk->sk_prot->disconnect(sk, O_NONBLOCK); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ break; ++ } ++ ++ /* Wake up anyone sleeping in poll. */ ++ sk->sk_state_change(sk); ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_shutdown); ++ ++/* ++ * ioctl() calls you can issue on an INET socket. Most of these are ++ * device configuration and stuff and very rarely used. Some ioctls ++ * pass on to the socket itself. ++ * ++ * NOTE: I like the idea of a module for the config stuff. ie ifconfig ++ * loads the devconfigure module does its configuring and unloads it. ++ * There's a good 20K of config code hanging around the kernel. ++ */ ++ ++int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ struct net *net = sock_net(sk); ++ void __user *p = (void __user *)arg; ++ struct ifreq ifr; ++ struct rtentry rt; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ if (copy_from_user(&rt, p, sizeof(struct rtentry))) ++ return -EFAULT; ++ err = ip_rt_ioctl(net, cmd, &rt); ++ break; ++ case SIOCRTMSG: ++ err = -EINVAL; ++ break; ++ case SIOCDARP: ++ case SIOCGARP: ++ case SIOCSARP: ++ err = arp_ioctl(net, cmd, (void __user *)arg); ++ break; ++ case SIOCGIFADDR: ++ case SIOCGIFBRDADDR: ++ case SIOCGIFNETMASK: ++ case SIOCGIFDSTADDR: ++ case SIOCGIFPFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ if (!err && put_user_ifreq(&ifr, p)) ++ err = -EFAULT; ++ break; ++ ++ case SIOCSIFADDR: ++ case SIOCSIFBRDADDR: ++ case SIOCSIFNETMASK: ++ case SIOCSIFDSTADDR: ++ case SIOCSIFPFLAGS: ++ case SIOCSIFFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ break; ++ default: ++ if (sk->sk_prot->ioctl) ++ err = sk->sk_prot->ioctl(sk, cmd, arg); ++ else ++ err = -ENOIOCTLCMD; ++ break; ++ } ++ return err; ++} ++EXPORT_SYMBOL(inet_ioctl); ++ ++#ifdef CONFIG_COMPAT ++static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, ++ struct compat_rtentry __user *ur) ++{ ++ compat_uptr_t rtdev; ++ struct rtentry rt; ++ ++ if (copy_from_user(&rt.rt_dst, &ur->rt_dst, ++ 3 * sizeof(struct sockaddr)) || ++ get_user(rt.rt_flags, &ur->rt_flags) || ++ get_user(rt.rt_metric, &ur->rt_metric) || ++ get_user(rt.rt_mtu, &ur->rt_mtu) || ++ get_user(rt.rt_window, &ur->rt_window) || ++ get_user(rt.rt_irtt, &ur->rt_irtt) || ++ get_user(rtdev, &ur->rt_dev)) ++ return -EFAULT; ++ ++ rt.rt_dev = compat_ptr(rtdev); ++ return ip_rt_ioctl(sock_net(sk), cmd, &rt); ++} ++ ++static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ void __user *argp = compat_ptr(arg); ++ struct sock *sk = sock->sk; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ return inet_compat_routing_ioctl(sk, cmd, argp); ++ default: ++ if (!sk->sk_prot->compat_ioctl) ++ return -ENOIOCTLCMD; ++ return sk->sk_prot->compat_ioctl(sk, cmd, arg); ++ } ++} ++#endif /* CONFIG_COMPAT */ ++ ++const struct proto_ops inet_stream_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_stream_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = inet_accept, ++ .getname = inet_getname, ++ .poll = tcp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp 
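The "how++" comment in inet_shutdown() above is easier to see as a table: SHUT_RD, SHUT_WR and SHUT_RDWR (0, 1, 2) become the bitmasks 1, 2 and 3, so bit 0 means "receives shut down" and bit 1 "sends shut down". The constants are restated below with their conventional values just for the demo.

#include <stdio.h>

#define RCV_SHUTDOWN  1
#define SEND_SHUTDOWN 2

int main(void)
{
    for (int how = 0; how <= 2; how++) {   /* SHUT_RD, SHUT_WR, SHUT_RDWR */
        int bits = how + 1;                /* the how++ in inet_shutdown() */

        printf("shutdown(fd, %d) -> rcv:%d snd:%d\n", how,
               !!(bits & RCV_SHUTDOWN), !!(bits & SEND_SHUTDOWN));
    }
    return 0;
}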
= sock_gettstamp, ++ .listen = inet_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++#ifdef CONFIG_MMU ++ .mmap = tcp_mmap, ++#endif ++ .sendpage = inet_sendpage, ++ .splice_read = tcp_splice_read, ++ .read_sock = tcp_read_sock, ++ .read_skb = tcp_read_skb, ++ .sendmsg_locked = tcp_sendmsg_locked, ++ .sendpage_locked = tcp_sendpage_locked, ++ .peek_len = tcp_peek_len, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++ .set_rcvlowat = tcp_set_rcvlowat, ++}; ++EXPORT_SYMBOL(inet_stream_ops); ++ ++const struct proto_ops inet_dgram_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = udp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .read_skb = udp_read_skb, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++ .set_peek_off = sk_set_peek_off, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++EXPORT_SYMBOL(inet_dgram_ops); ++ ++/* ++ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without ++ * udp_poll ++ */ ++static const struct proto_ops inet_sockraw_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = datagram_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++ ++static const struct net_proto_family inet_family_ops = { ++ .family = PF_INET, ++ .create = inet_create, ++ .owner = THIS_MODULE, ++}; ++ ++/* Upon startup we insert all the elements in inetsw_array[] into ++ * the linked list inetsw. ++ */ ++static struct inet_protosw inetsw_array[] = ++{ ++ { ++ .type = SOCK_STREAM, ++ .protocol = IPPROTO_TCP, ++ .prot = &tcp_prot, ++ .ops = &inet_stream_ops, ++ .flags = INET_PROTOSW_PERMANENT | ++ INET_PROTOSW_ICSK, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_UDP, ++ .prot = &udp_prot, ++ .ops = &inet_dgram_ops, ++ .flags = INET_PROTOSW_PERMANENT, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_ICMP, ++ .prot = &ping_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ }, ++ ++ { ++ .type = SOCK_RAW, ++ .protocol = IPPROTO_IP, /* wild card */ ++ .prot = &raw_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ } ++}; ++ ++#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) ++ ++void inet_register_protosw(struct inet_protosw *p) ++{ ++ struct list_head *lh; ++ struct inet_protosw *answer; ++ int protocol = p->protocol; ++ struct list_head *last_perm; ++ ++ spin_lock_bh(&inetsw_lock); ++ ++ if (p->type >= SOCK_MAX) ++ goto out_illegal; ++ ++ /* If we are trying to override a permanent protocol, bail. 
*/ ++ last_perm = &inetsw[p->type]; ++ list_for_each(lh, &inetsw[p->type]) { ++ answer = list_entry(lh, struct inet_protosw, list); ++ /* Check only the non-wild match. */ ++ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) ++ break; ++ if (protocol == answer->protocol) ++ goto out_permanent; ++ last_perm = lh; ++ } ++ ++ /* Add the new entry after the last permanent entry if any, so that ++ * the new entry does not override a permanent entry when matched with ++ * a wild-card protocol. But it is allowed to override any existing ++ * non-permanent entry. This means that when we remove this entry, the ++ * system automatically returns to the old behavior. ++ */ ++ list_add_rcu(&p->list, last_perm); ++out: ++ spin_unlock_bh(&inetsw_lock); ++ ++ return; ++ ++out_permanent: ++ pr_err("Attempt to override permanent protocol %d\n", protocol); ++ goto out; ++ ++out_illegal: ++ pr_err("Ignoring attempt to register invalid socket type %d\n", ++ p->type); ++ goto out; ++} ++EXPORT_SYMBOL(inet_register_protosw); ++ ++void inet_unregister_protosw(struct inet_protosw *p) ++{ ++ if (INET_PROTOSW_PERMANENT & p->flags) { ++ pr_err("Attempt to unregister permanent protocol %d\n", ++ p->protocol); ++ } else { ++ spin_lock_bh(&inetsw_lock); ++ list_del_rcu(&p->list); ++ spin_unlock_bh(&inetsw_lock); ++ ++ synchronize_net(); ++ } ++} ++EXPORT_SYMBOL(inet_unregister_protosw); ++ ++static int inet_sk_reselect_saddr(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ __be32 old_saddr = inet->inet_saddr; ++ __be32 daddr = inet->inet_daddr; ++ struct flowi4 *fl4; ++ struct rtable *rt; ++ __be32 new_saddr; ++ struct ip_options_rcu *inet_opt; ++ ++ inet_opt = rcu_dereference_protected(inet->inet_opt, ++ lockdep_sock_is_held(sk)); ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ ++ /* Query new route. */ ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, ++ sk->sk_protocol, inet->inet_sport, ++ inet->inet_dport, sk); ++ if (IS_ERR(rt)) ++ return PTR_ERR(rt); ++ ++ sk_setup_caps(sk, &rt->dst); ++ ++ new_saddr = fl4->saddr; ++ ++ if (new_saddr == old_saddr) ++ return 0; ++ ++ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { ++ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", ++ __func__, &old_saddr, &new_saddr); ++ } ++ ++ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; ++ ++ /* ++ * XXX The only one ugly spot where we need to ++ * XXX really change the sockets identity after ++ * XXX it has entered the hashes. -DaveM ++ * ++ * Besides that, it does not check for connection ++ * uniqueness. Wait for troubles. ++ */ ++ return __sk_prot_rehash(sk); ++} ++ ++int inet_sk_rebuild_header(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); ++ __be32 daddr; ++ struct ip_options_rcu *inet_opt; ++ struct flowi4 *fl4; ++ int err; ++ ++ /* Route is OK, nothing to do. */ ++ if (rt) ++ return 0; ++ ++ /* Reroute. */ ++ rcu_read_lock(); ++ inet_opt = rcu_dereference(inet->inet_opt); ++ daddr = inet->inet_daddr; ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ rcu_read_unlock(); ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, ++ inet->inet_dport, inet->inet_sport, ++ sk->sk_protocol, RT_CONN_FLAGS(sk), ++ sk->sk_bound_dev_if); ++ if (!IS_ERR(rt)) { ++ err = 0; ++ sk_setup_caps(sk, &rt->dst); ++ } else { ++ err = PTR_ERR(rt); ++ ++ /* Routing failed... 
*/ ++ sk->sk_route_caps = 0; ++ /* ++ * Other protocols have to map its equivalent state to TCP_SYN_SENT. ++ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme ++ */ ++ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || ++ sk->sk_state != TCP_SYN_SENT || ++ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || ++ (err = inet_sk_reselect_saddr(sk)) != 0) ++ sk->sk_err_soft = -err; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(inet_sk_rebuild_header); ++ ++void inet_sk_set_state(struct sock *sk, int state) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, state); ++ sk->sk_state = state; ++} ++EXPORT_SYMBOL(inet_sk_set_state); ++ ++void inet_sk_state_store(struct sock *sk, int newstate) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, newstate); ++ smp_store_release(&sk->sk_state, newstate); ++} ++ ++struct sk_buff *inet_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ bool udpfrag = false, fixedid = false, gso_partial, encap; ++ struct sk_buff *segs = ERR_PTR(-EINVAL); ++ const struct net_offload *ops; ++ unsigned int offset = 0; ++ struct iphdr *iph; ++ int proto, tot_len; ++ int nhoff; ++ int ihl; ++ int id; ++ ++ skb_reset_network_header(skb); ++ nhoff = skb_network_header(skb) - skb_mac_header(skb); ++ if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) ++ goto out; ++ ++ iph = ip_hdr(skb); ++ ihl = iph->ihl * 4; ++ if (ihl < sizeof(*iph)) ++ goto out; ++ ++ id = ntohs(iph->id); ++ proto = iph->protocol; ++ ++ /* Warning: after this point, iph might be no longer valid */ ++ if (unlikely(!pskb_may_pull(skb, ihl))) ++ goto out; ++ __skb_pull(skb, ihl); ++ ++ encap = SKB_GSO_CB(skb)->encap_level > 0; ++ if (encap) ++ features &= skb->dev->hw_enc_features; ++ SKB_GSO_CB(skb)->encap_level += ihl; ++ ++ skb_reset_transport_header(skb); ++ ++ segs = ERR_PTR(-EPROTONOSUPPORT); ++ ++ if (!skb->encapsulation || encap) { ++ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ++ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); ++ ++ /* fixed ID is invalid if DF bit is not set */ ++ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) ++ goto out; ++ } ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (likely(ops && ops->callbacks.gso_segment)) { ++ segs = ops->callbacks.gso_segment(skb, features); ++ if (!segs) ++ skb->network_header = skb_mac_header(skb) + nhoff - skb->head; ++ } ++ ++ if (IS_ERR_OR_NULL(segs)) ++ goto out; ++ ++ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); ++ ++ skb = segs; ++ do { ++ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); ++ if (udpfrag) { ++ iph->frag_off = htons(offset >> 3); ++ if (skb->next) ++ iph->frag_off |= htons(IP_MF); ++ offset += skb->len - nhoff - ihl; ++ tot_len = skb->len - nhoff; ++ } else if (skb_is_gso(skb)) { ++ if (!fixedid) { ++ iph->id = htons(id); ++ id += skb_shinfo(skb)->gso_segs; ++ } ++ ++ if (gso_partial) ++ tot_len = skb_shinfo(skb)->gso_size + ++ SKB_GSO_CB(skb)->data_offset + ++ skb->head - (unsigned char *)iph; ++ else ++ tot_len = skb->len - nhoff; ++ } else { ++ if (!fixedid) ++ iph->id = htons(id++); ++ tot_len = skb->len - nhoff; ++ } ++ iph->tot_len = htons(tot_len); ++ ip_send_check(iph); ++ if (encap) ++ skb_reset_inner_headers(skb); ++ skb->network_header = (u8 *)iph - skb->head; ++ skb_reset_mac_len(skb); ++ } while ((skb = skb->next)); ++ ++out: ++ return segs; ++} ++ ++static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) ++ return ERR_PTR(-EINVAL); ++ ++ 
return inet_gso_segment(skb, features); ++} ++ ++struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) ++{ ++ const struct net_offload *ops; ++ struct sk_buff *pp = NULL; ++ const struct iphdr *iph; ++ struct sk_buff *p; ++ unsigned int hlen; ++ unsigned int off; ++ unsigned int id; ++ int flush = 1; ++ int proto; ++ ++ off = skb_gro_offset(skb); ++ hlen = off + sizeof(*iph); ++ iph = skb_gro_header_fast(skb, off); ++ if (skb_gro_header_hard(skb, hlen)) { ++ iph = skb_gro_header_slow(skb, hlen, off); ++ if (unlikely(!iph)) ++ goto out; ++ } ++ ++ proto = iph->protocol; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (!ops || !ops->callbacks.gro_receive) ++ goto out; ++ ++ if (*(u8 *)iph != 0x45) ++ goto out; ++ ++ if (ip_is_fragment(iph)) ++ goto out; ++ ++ if (unlikely(ip_fast_csum((u8 *)iph, 5))) ++ goto out; ++ ++ id = ntohl(*(__be32 *)&iph->id); ++ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); ++ id >>= 16; ++ ++ list_for_each_entry(p, head, list) { ++ struct iphdr *iph2; ++ u16 flush_id; ++ ++ if (!NAPI_GRO_CB(p)->same_flow) ++ continue; ++ ++ iph2 = (struct iphdr *)(p->data + off); ++ /* The above works because, with the exception of the top ++ * (inner most) layer, we only aggregate pkts with the same ++ * hdr length so all the hdrs we'll need to verify will start ++ * at the same offset. ++ */ ++ if ((iph->protocol ^ iph2->protocol) | ++ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ++ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { ++ NAPI_GRO_CB(p)->same_flow = 0; ++ continue; ++ } ++ ++ /* All fields must match except length and checksum. */ ++ NAPI_GRO_CB(p)->flush |= ++ (iph->ttl ^ iph2->ttl) | ++ (iph->tos ^ iph2->tos) | ++ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); ++ ++ NAPI_GRO_CB(p)->flush |= flush; ++ ++ /* We need to store of the IP ID check to be included later ++ * when we can verify that this packet does in fact belong ++ * to a given flow. ++ */ ++ flush_id = (u16)(id - ntohs(iph2->id)); ++ ++ /* This bit of code makes it much easier for us to identify ++ * the cases where we are doing atomic vs non-atomic IP ID ++ * checks. Specifically an atomic check can return IP ID ++ * values 0 - 0xFFFF, while a non-atomic check can only ++ * return 0 or 0xFFFF. ++ */ ++ if (!NAPI_GRO_CB(p)->is_atomic || ++ !(iph->frag_off & htons(IP_DF))) { ++ flush_id ^= NAPI_GRO_CB(p)->count; ++ flush_id = flush_id ? 0xFFFF : 0; ++ } ++ ++ /* If the previous IP ID value was based on an atomic ++ * datagram we can overwrite the value and ignore it. ++ */ ++ if (NAPI_GRO_CB(skb)->is_atomic) ++ NAPI_GRO_CB(p)->flush_id = flush_id; ++ else ++ NAPI_GRO_CB(p)->flush_id |= flush_id; ++ } ++ ++ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); ++ NAPI_GRO_CB(skb)->flush |= flush; ++ skb_set_network_header(skb, off); ++ /* The above will be needed by the transport layer if there is one ++ * immediately following this IP hdr. 
++ */ ++ ++ /* Note : No need to call skb_gro_postpull_rcsum() here, ++ * as we already checked checksum over ipv4 header was 0 ++ */ ++ skb_gro_pull(skb, sizeof(*iph)); ++ skb_set_transport_header(skb, skb_gro_offset(skb)); ++ ++ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ++ ops->callbacks.gro_receive, head, skb); ++ ++out: ++ skb_gro_flush_final(skb, pp, flush); ++ ++ return pp; ++} ++ ++static struct sk_buff *ipip_gro_receive(struct list_head *head, ++ struct sk_buff *skb) ++{ ++ if (NAPI_GRO_CB(skb)->encap_mark) { ++ NAPI_GRO_CB(skb)->flush = 1; ++ return NULL; ++ } ++ ++ NAPI_GRO_CB(skb)->encap_mark = 1; ++ ++ return inet_gro_receive(head, skb); ++} ++ ++#define SECONDS_PER_DAY 86400 ++ ++/* inet_current_timestamp - Return IP network timestamp ++ * ++ * Return milliseconds since midnight in network byte order. ++ */ ++__be32 inet_current_timestamp(void) ++{ ++ u32 secs; ++ u32 msecs; ++ struct timespec64 ts; ++ ++ ktime_get_real_ts64(&ts); ++ ++ /* Get secs since midnight. */ ++ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); ++ /* Convert to msecs. */ ++ msecs = secs * MSEC_PER_SEC; ++ /* Convert nsec to msec. */ ++ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; ++ ++ /* Convert to network byte order. */ ++ return htonl(msecs); ++} ++EXPORT_SYMBOL(inet_current_timestamp); ++ ++int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ++{ ++ if (sk->sk_family == AF_INET) ++ return ip_recv_error(sk, msg, len, addr_len); ++#if IS_ENABLED(CONFIG_IPV6) ++ if (sk->sk_family == AF_INET6) ++ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); ++#endif ++ return -EINVAL; ++} ++ ++int inet_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ __be16 newlen = htons(skb->len - nhoff); ++ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); ++ const struct net_offload *ops; ++ int proto = iph->protocol; ++ int err = -ENOSYS; ++ ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); ++ skb_set_inner_network_header(skb, nhoff); ++ } ++ ++ csum_replace2(&iph->check, iph->tot_len, newlen); ++ iph->tot_len = newlen; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) ++ goto out; ++ ++ /* Only need to add sizeof(*iph) to get to the next hdr below ++ * because any hdr with option will have been flushed in ++ * inet_gro_receive(). ++ */ ++ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, ++ tcp4_gro_complete, udp4_gro_complete, ++ skb, nhoff + sizeof(*iph)); ++ ++out: ++ return err; ++} ++ ++static int ipip_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ skb->encapsulation = 1; ++ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; ++ return inet_gro_complete(skb, nhoff); ++} ++ ++int inet_ctl_sock_create(struct sock **sk, unsigned short family, ++ unsigned short type, unsigned char protocol, ++ struct net *net) ++{ ++ struct socket *sock; ++ int rc = sock_create_kern(net, family, type, protocol, &sock); ++ ++ if (rc == 0) { ++ *sk = sock->sk; ++ (*sk)->sk_allocation = GFP_ATOMIC; ++ /* ++ * Unhash it so that IP input processing does not even see it, ++ * we do not wish this socket to see incoming packets. 
++ */ ++ (*sk)->sk_prot->unhash(*sk); ++ } ++ return rc; ++} ++EXPORT_SYMBOL_GPL(inet_ctl_sock_create); ++ ++unsigned long snmp_fold_field(void __percpu *mib, int offt) ++{ ++ unsigned long res = 0; ++ int i; ++ ++ for_each_possible_cpu(i) ++ res += snmp_get_cpu_field(mib, i, offt); ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field); ++ ++#if BITS_PER_LONG==32 ++ ++u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, ++ size_t syncp_offset) ++{ ++ void *bhptr; ++ struct u64_stats_sync *syncp; ++ u64 v; ++ unsigned int start; ++ ++ bhptr = per_cpu_ptr(mib, cpu); ++ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); ++ do { ++ start = u64_stats_fetch_begin_irq(syncp); ++ v = *(((u64 *)bhptr) + offt); ++ } while (u64_stats_fetch_retry_irq(syncp, start)); ++ ++ return v; ++} ++EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); ++ ++u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) ++{ ++ u64 res = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); ++ } ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field64); ++#endif ++ ++#ifdef CONFIG_IP_MULTICAST ++static const struct net_protocol igmp_protocol = { ++ .handler = igmp_rcv, ++}; ++#endif ++ ++static const struct net_protocol tcp_protocol = { ++ .handler = tcp_v4_rcv, ++ .err_handler = tcp_v4_err, ++ .no_policy = 1, ++ .icmp_strict_tag_validation = 1, ++}; ++ ++static const struct net_protocol udp_protocol = { ++ .handler = udp_rcv, ++ .err_handler = udp_err, ++ .no_policy = 1, ++}; ++ ++static const struct net_protocol icmp_protocol = { ++ .handler = icmp_rcv, ++ .err_handler = icmp_err, ++ .no_policy = 1, ++}; ++ ++static __net_init int ipv4_mib_init_net(struct net *net) ++{ ++ int i; ++ ++ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); ++ if (!net->mib.tcp_statistics) ++ goto err_tcp_mib; ++ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); ++ if (!net->mib.ip_statistics) ++ goto err_ip_mib; ++ ++ for_each_possible_cpu(i) { ++ struct ipstats_mib *af_inet_stats; ++ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); ++ u64_stats_init(&af_inet_stats->syncp); ++ } ++ ++ net->mib.net_statistics = alloc_percpu(struct linux_mib); ++ if (!net->mib.net_statistics) ++ goto err_net_mib; ++ net->mib.udp_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udp_statistics) ++ goto err_udp_mib; ++ net->mib.udplite_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udplite_statistics) ++ goto err_udplite_mib; ++ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); ++ if (!net->mib.icmp_statistics) ++ goto err_icmp_mib; ++ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), ++ GFP_KERNEL); ++ if (!net->mib.icmpmsg_statistics) ++ goto err_icmpmsg_mib; ++ ++ tcp_mib_init(net); ++ return 0; ++ ++err_icmpmsg_mib: ++ free_percpu(net->mib.icmp_statistics); ++err_icmp_mib: ++ free_percpu(net->mib.udplite_statistics); ++err_udplite_mib: ++ free_percpu(net->mib.udp_statistics); ++err_udp_mib: ++ free_percpu(net->mib.net_statistics); ++err_net_mib: ++ free_percpu(net->mib.ip_statistics); ++err_ip_mib: ++ free_percpu(net->mib.tcp_statistics); ++err_tcp_mib: ++ return -ENOMEM; ++} ++ ++static __net_exit void ipv4_mib_exit_net(struct net *net) ++{ ++ kfree(net->mib.icmpmsg_statistics); ++ free_percpu(net->mib.icmp_statistics); ++ free_percpu(net->mib.udplite_statistics); ++ free_percpu(net->mib.udp_statistics); ++ free_percpu(net->mib.net_statistics); ++ free_percpu(net->mib.ip_statistics); ++ 
free_percpu(net->mib.tcp_statistics); ++#ifdef CONFIG_MPTCP ++ /* allocated on demand, see mptcp_init_sock() */ ++ free_percpu(net->mib.mptcp_statistics); ++#endif ++} ++ ++static __net_initdata struct pernet_operations ipv4_mib_ops = { ++ .init = ipv4_mib_init_net, ++ .exit = ipv4_mib_exit_net, ++}; ++ ++static int __init init_ipv4_mibs(void) ++{ ++ return register_pernet_subsys(&ipv4_mib_ops); ++} ++ ++static __net_init int inet_init_net(struct net *net) ++{ ++ /* ++ * Set defaults for local port range ++ */ ++ seqlock_init(&net->ipv4.ip_local_ports.lock); ++ net->ipv4.ip_local_ports.range[0] = 32768; ++ net->ipv4.ip_local_ports.range[1] = 60999; ++ ++ seqlock_init(&net->ipv4.ping_group_range.lock); ++ /* ++ * Sane defaults - nobody may create ping sockets. ++ * Boot scripts should set this to distro-specific group. ++ */ ++ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); ++ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); ++ ++ /* Default values for sysctl-controlled parameters. ++ * We set them here, in case sysctl is not compiled. ++ */ ++ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; ++ net->ipv4.sysctl_ip_fwd_update_priority = 1; ++ net->ipv4.sysctl_ip_dynaddr = 0; ++ net->ipv4.sysctl_ip_early_demux = 1; ++ net->ipv4.sysctl_udp_early_demux = 1; ++ net->ipv4.sysctl_tcp_early_demux = 1; ++ net->ipv4.sysctl_nexthop_compat_mode = 1; ++#ifdef CONFIG_SYSCTL ++ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; ++#endif ++ ++ /* Some igmp sysctl, whose values are always used */ ++ net->ipv4.sysctl_igmp_max_memberships = 20; ++ net->ipv4.sysctl_igmp_max_msf = 10; ++ /* IGMP reports for link-local multicast groups are enabled by default */ ++ net->ipv4.sysctl_igmp_llm_reports = 1; ++ net->ipv4.sysctl_igmp_qrv = 2; ++ ++ net->ipv4.sysctl_fib_notify_on_flag_change = 0; ++ ++ return 0; ++} ++ ++static __net_initdata struct pernet_operations af_inet_ops = { ++ .init = inet_init_net, ++}; ++ ++static int __init init_inet_pernet_ops(void) ++{ ++ return register_pernet_subsys(&af_inet_ops); ++} ++ ++static int ipv4_proc_init(void); ++ ++/* ++ * IP protocol layer initialiser ++ */ ++ ++static struct packet_offload ip_packet_offload __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .callbacks = { ++ .gso_segment = inet_gso_segment, ++ .gro_receive = inet_gro_receive, ++ .gro_complete = inet_gro_complete, ++ }, ++}; ++ ++static const struct net_offload ipip_offload = { ++ .callbacks = { ++ .gso_segment = ipip_gso_segment, ++ .gro_receive = ipip_gro_receive, ++ .gro_complete = ipip_gro_complete, ++ }, ++}; ++ ++static int __init ipip_offload_init(void) ++{ ++ return inet_add_offload(&ipip_offload, IPPROTO_IPIP); ++} ++ ++static int __init ipv4_offload_init(void) ++{ ++ /* ++ * Add offloads ++ */ ++ if (udpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add UDP protocol offload\n", __func__); ++ if (tcpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add TCP protocol offload\n", __func__); ++ if (ipip_offload_init() < 0) ++ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); ++ ++ dev_add_offload(&ip_packet_offload); ++ return 0; ++} ++ ++fs_initcall(ipv4_offload_init); ++ ++static struct packet_type ip_packet_type __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .func = ip_rcv, ++ .list_func = ip_list_rcv, ++}; ++ ++static int __init inet_init(void) ++{ ++ struct inet_protosw *q; ++ struct list_head *r; ++ int rc; ++ ++ sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); ++ ++ raw_hashinfo_init(&raw_v4_hashinfo); ++ ++ rc = proto_register(&tcp_prot, 1); ++ if 
(rc) ++ goto out; ++ ++ rc = proto_register(&udp_prot, 1); ++ if (rc) ++ goto out_unregister_tcp_proto; ++ ++ rc = proto_register(&raw_prot, 1); ++ if (rc) ++ goto out_unregister_udp_proto; ++ ++ rc = proto_register(&ping_prot, 1); ++ if (rc) ++ goto out_unregister_raw_proto; ++ ++ /* ++ * Tell SOCKET that we are alive... ++ */ ++ ++ (void)sock_register(&inet_family_ops); ++ ++#ifdef CONFIG_SYSCTL ++ ip_static_sysctl_init(); ++#endif ++ ++ /* ++ * Add all the base protocols. ++ */ ++ ++ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) ++ pr_crit("%s: Cannot add ICMP protocol\n", __func__); ++ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) ++ pr_crit("%s: Cannot add UDP protocol\n", __func__); ++ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) ++ pr_crit("%s: Cannot add TCP protocol\n", __func__); ++#ifdef CONFIG_IP_MULTICAST ++ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) ++ pr_crit("%s: Cannot add IGMP protocol\n", __func__); ++#endif ++ ++ /* Register the socket-side information for inet_create. */ ++ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) ++ INIT_LIST_HEAD(r); ++ ++ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) ++ inet_register_protosw(q); ++ ++ /* ++ * Set the ARP module up ++ */ ++ ++ arp_init(); ++ ++ /* ++ * Set the IP module up ++ */ ++ ++ ip_init(); ++ ++ /* Initialise per-cpu ipv4 mibs */ ++ if (init_ipv4_mibs()) ++ panic("%s: Cannot init ipv4 mibs\n", __func__); ++ ++ /* Setup TCP slab cache for open requests. */ ++ tcp_init(); ++ ++ /* Setup UDP memory threshold */ ++ udp_init(); ++ ++ /* Add UDP-Lite (RFC 3828) */ ++ udplite4_register(); ++ ++ raw_init(); ++ ++ ping_init(); ++ ++ /* ++ * Set the ICMP layer up ++ */ ++ ++ if (icmp_init() < 0) ++ panic("Failed to create the ICMP control socket.\n"); ++ ++ /* ++ * Initialise the multicast router ++ */ ++#if defined(CONFIG_IP_MROUTE) ++ if (ip_mr_init()) ++ pr_crit("%s: Cannot init ipv4 mroute\n", __func__); ++#endif ++ ++ if (init_inet_pernet_ops()) ++ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ++ ++ ipv4_proc_init(); ++ ++ ipfrag_init(); ++ ++ dev_add_pack(&ip_packet_type); ++ ++ ip_tunnel_core_init(); ++ ++ rc = 0; ++out: ++ return rc; ++out_unregister_raw_proto: ++ proto_unregister(&raw_prot); ++out_unregister_udp_proto: ++ proto_unregister(&udp_prot); ++out_unregister_tcp_proto: ++ proto_unregister(&tcp_prot); ++ goto out; ++} ++ ++fs_initcall(inet_init); ++ ++/* ------------------------------------------------------------------------ */ ++ ++#ifdef CONFIG_PROC_FS ++static int __init ipv4_proc_init(void) ++{ ++ int rc = 0; ++ ++ if (raw_proc_init()) ++ goto out_raw; ++ if (tcp4_proc_init()) ++ goto out_tcp; ++ if (udp4_proc_init()) ++ goto out_udp; ++ if (ping_proc_init()) ++ goto out_ping; ++ if (ip_misc_proc_init()) ++ goto out_misc; ++out: ++ return rc; ++out_misc: ++ ping_proc_exit(); ++out_ping: ++ udp4_proc_exit(); ++out_udp: ++ tcp4_proc_exit(); ++out_tcp: ++ raw_proc_exit(); ++out_raw: ++ rc = -ENOMEM; ++ goto out; ++} ++ ++#else /* CONFIG_PROC_FS */ ++static int __init ipv4_proc_init(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -rupN linux.orig/net/ipv6/seg6_local.c linux/net/ipv6/seg6_local.c +--- linux.orig/net/ipv6/seg6_local.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv6/seg6_local.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_bu pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { @@ -8869,11 +59406,10 @@ index b7de5e46fdd8f..f84da849819cc 100644 
counters.packets += packets; counters.bytes += bytes; -diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c -index 9d7b238a67372..965b9cb2ef3f2 100644 ---- a/net/mac80211/sta_info.c -+++ b/net/mac80211/sta_info.c -@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, +diff -rupN linux.orig/net/mac80211/sta_info.c linux/net/mac80211/sta_info.c +--- linux.orig/net/mac80211/sta_info.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mac80211/sta_info.c 2022-12-04 10:40:26.732034003 -0500 +@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu( u64 value; do { @@ -8885,7 +59421,7 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) +@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(st u64 value; do { @@ -8897,11 +59433,10 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c -index b52afe316dc41..35b5f806fdda1 100644 ---- a/net/mpls/af_mpls.c -+++ b/net/mpls/af_mpls.c -@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, +diff -rupN linux.orig/net/mpls/af_mpls.c linux/net/mpls/af_mpls.c +--- linux.orig/net/mpls/af_mpls.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mpls/af_mpls.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_d p = per_cpu_ptr(mdev->stats, i); do { @@ -8913,11 +59448,10 @@ index b52afe316dc41..35b5f806fdda1 100644 stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; -diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c -index efab2b06d3732..5a7349002508e 100644 ---- a/net/netfilter/ipvs/ip_vs_ctl.c -+++ b/net/netfilter/ipvs/ip_vs_ctl.c -@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +diff -rupN linux.orig/net/netfilter/ipvs/ip_vs_ctl.c linux/net/netfilter/ipvs/ip_vs_ctl.c +--- linux.orig/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-04 10:40:26.736033993 -0500 +@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struc u64 conns, inpkts, outpkts, inbytes, outbytes; do { @@ -8933,11 +59467,10 @@ index efab2b06d3732..5a7349002508e 100644 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, -diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c -index 63c70141b3e5d..cde0d9f0d838e 100644 ---- a/net/netfilter/nf_tables_api.c -+++ b/net/netfilter/nf_tables_api.c -@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +diff -rupN linux.orig/net/netfilter/nf_tables_api.c linux/net/netfilter/nf_tables_api.c +--- linux.orig/net/netfilter/nf_tables_api.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/nf_tables_api.c 2022-12-04 10:40:26.736033993 -0500 +@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { @@ -8950,11 +59483,10 @@ index 63c70141b3e5d..cde0d9f0d838e 100644 total.pkts += pkts; total.bytes += bytes; } -diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c -index 93c596e3b22b9..b05458c170484 100644 ---- a/net/openvswitch/datapath.c -+++ b/net/openvswitch/datapath.c -@@ -715,9 +715,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, +diff -rupN 
linux.orig/net/openvswitch/datapath.c linux/net/openvswitch/datapath.c +--- linux.orig/net/openvswitch/datapath.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/datapath.c 2022-12-04 10:40:26.736033993 -0500 +@@ -715,9 +715,9 @@ static void get_dp_stats(const struct da percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { @@ -8966,11 +59498,10 @@ index 93c596e3b22b9..b05458c170484 100644 stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; -diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c -index d4a2db0b22998..0a0e4c283f02e 100644 ---- a/net/openvswitch/flow_table.c -+++ b/net/openvswitch/flow_table.c -@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) +diff -rupN linux.orig/net/openvswitch/flow_table.c linux/net/openvswitch/flow_table.c +--- linux.orig/net/openvswitch/flow_table.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/flow_table.c 2022-12-04 10:40:26.736033993 -0500 +@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counter stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { @@ -8982,7 +59513,7 @@ index d4a2db0b22998..0a0e4c283f02e 100644 ma->masks_usage_zero_cntr[i] += counter; } -@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) +@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flo stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { diff --git a/packages/jelos/tmpfiles.d/jelos-dirs.conf b/packages/virtual/emulators/tmpfiles.d/jelos-dirs.conf similarity index 100% rename from packages/jelos/tmpfiles.d/jelos-dirs.conf rename to packages/virtual/emulators/tmpfiles.d/jelos-dirs.conf diff --git a/projects/Rockchip/packages/linux/package.mk b/projects/Rockchip/packages/linux/package.mk index 92fe16b78..de7b60fc9 100644 --- a/projects/Rockchip/packages/linux/package.mk +++ b/projects/Rockchip/packages/linux/package.mk @@ -25,7 +25,7 @@ case ${DEVICE} in PKG_GIT_CLONE_BRANCH="main" ;; RG552) - PKG_VERSION="6.0.7" + PKG_VERSION="6.0.11" PKG_URL="https://www.kernel.org/pub/linux/kernel/v6.x/${PKG_NAME}-${PKG_VERSION}.tar.xz" ;; RG353P|RG503) diff --git a/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch b/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch index c0c976eb9..2de168ec2 100644 --- a/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch +++ b/projects/Rockchip/packages/linux/patches/RG552/patch-6.0.5-rt14.patch @@ -1,7 +1,43 @@ -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 11ecf09aadc86..98aa5a478719c 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig +diff -rupN linux.orig/arch/arm/include/asm/thread_info.h linux/arch/arm/include/asm/thread_info.h +--- linux.orig/arch/arm/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 +@@ -62,6 +62,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ + struct cpu_context_save cpu_context; /* cpu context */ +@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(stru + #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ + #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ + #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + 
#define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(stru + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(stru + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff -rupN linux.orig/arch/arm/Kconfig linux/arch/arm/Kconfig +--- linux.orig/arch/arm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -33,6 +33,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW @@ -35,48 +71,9 @@ index 11ecf09aadc86..98aa5a478719c 100644 select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK -diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h -index aecc403b28804..1b56e56f8f415 100644 ---- a/arch/arm/include/asm/thread_info.h -+++ b/arch/arm/include/asm/thread_info.h -@@ -62,6 +62,7 @@ struct cpu_context_save { - struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable, <0 => bug */ -+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ - __u32 cpu; /* cpu */ - __u32 cpu_domain; /* cpu domain */ - struct cpu_context_save cpu_context; /* cpu context */ -@@ -133,6 +134,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ - #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ - #define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ -+#define TIF_NEED_RESCHED_LAZY 9 - - #define TIF_USING_IWMMXT 17 - #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -@@ -147,6 +149,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) - #define _TIF_SECCOMP (1 << TIF_SECCOMP) - #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) - #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) - - /* Checks for any syscall work in entry-common.S */ -@@ -156,7 +159,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, - /* - * Change these and you break ASM code in entry-common.S - */ --#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ -+ _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NOTIFY_SIGNAL) - -diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c -index 2c8d76fd7c662..c3bdec7d2df9c 100644 ---- a/arch/arm/kernel/asm-offsets.c -+++ b/arch/arm/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm/kernel/asm-offsets.c linux/arch/arm/kernel/asm-offsets.c +--- linux.orig/arch/arm/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, 
flags)); @@ -85,11 +82,10 @@ index 2c8d76fd7c662..c3bdec7d2df9c 100644 DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); -diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S -index c39303e5c2347..cfb4660e9feab 100644 ---- a/arch/arm/kernel/entry-armv.S -+++ b/arch/arm/kernel/entry-armv.S -@@ -222,11 +222,18 @@ ENDPROC(__dabt_svc) +diff -rupN linux.orig/arch/arm/kernel/entry-armv.S linux/arch/arm/kernel/entry-armv.S +--- linux.orig/arch/arm/kernel/entry-armv.S 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/entry-armv.S 2022-12-04 10:40:26.676034147 -0500 +@@ -222,11 +222,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -110,7 +106,7 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif svc_exit r5, irq = 1 @ return from exception -@@ -241,8 +248,14 @@ ENDPROC(__irq_svc) +@@ -241,8 +248,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED @@ -126,11 +122,10 @@ index c39303e5c2347..cfb4660e9feab 100644 #endif __und_fault: -diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c -index ea128e32e8ca8..3671a4214d6f4 100644 ---- a/arch/arm/kernel/signal.c -+++ b/arch/arm/kernel/signal.c -@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) +diff -rupN linux.orig/arch/arm/kernel/signal.c linux/arch/arm/kernel/signal.c +--- linux.orig/arch/arm/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, un */ trace_hardirqs_off(); do { @@ -140,11 +135,10 @@ index ea128e32e8ca8..3671a4214d6f4 100644 schedule(); } else { if (unlikely(!user_mode(regs))) -diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c -index 46cccd6bf705a..480a1976a9dce 100644 ---- a/arch/arm/mm/fault.c -+++ b/arch/arm/mm/fault.c -@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +diff -rupN linux.orig/arch/arm/mm/fault.c linux/arch/arm/mm/fault.c +--- linux.orig/arch/arm/mm/fault.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm/mm/fault.c 2022-12-04 10:40:26.676034147 -0500 +@@ -421,6 +421,9 @@ do_translation_fault(unsigned long addr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); @@ -154,7 +148,7 @@ index 46cccd6bf705a..480a1976a9dce 100644 if (user_mode(regs)) goto bad_area; -@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, +@@ -491,6 +494,9 @@ do_translation_fault(unsigned long addr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -164,31 +158,10 @@ index 46cccd6bf705a..480a1976a9dce 100644 do_bad_area(addr, fsr, regs); return 0; } -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 3795eb5ba1cdd..6922949e61b71 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -93,6 +93,7 @@ config ARM64 - select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 - select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_SUPPORTS_PAGE_TABLE_CHECK -+ select ARCH_SUPPORTS_RT - select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT - select ARCH_WANT_DEFAULT_BPF_JIT - select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -@@ -200,6 +201,7 @@ config ARM64 - select HAVE_PERF_USER_STACK_DUMP - select HAVE_PREEMPT_DYNAMIC_KEY - select 
HAVE_REGS_AND_STACK_ACCESS_API -+ select HAVE_PREEMPT_LAZY - select HAVE_POSIX_CPU_TIMERS_TASK_WORK - select HAVE_FUNCTION_ARG_ACCESS_API - select MMU_GATHER_RCU_TABLE_FREE -diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h -index 0159b625cc7f0..a5486918e5eeb 100644 ---- a/arch/arm64/include/asm/preempt.h -+++ b/arch/arm64/include/asm/preempt.h -@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_and_test(void) +diff -rupN linux.orig/arch/arm64/include/asm/preempt.h linux/arch/arm64/include/asm/preempt.h +--- linux.orig/arch/arm64/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -71,13 +71,36 @@ static inline bool __preempt_count_dec_a * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ @@ -226,10 +199,9 @@ index 0159b625cc7f0..a5486918e5eeb 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h -index 848739c15de82..4b7148fd5551f 100644 ---- a/arch/arm64/include/asm/thread_info.h -+++ b/arch/arm64/include/asm/thread_info.h +diff -rupN linux.orig/arch/arm64/include/asm/thread_info.h linux/arch/arm64/include/asm/thread_info.h +--- linux.orig/arch/arm64/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -26,6 +26,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ @@ -238,7 +210,7 @@ index 848739c15de82..4b7148fd5551f 100644 union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { -@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -68,6 +69,7 @@ int arch_dup_task_struct(struct task_str #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ @@ -246,7 +218,7 @@ index 848739c15de82..4b7148fd5551f 100644 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ -@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -100,8 +102,10 @@ int arch_dup_task_struct(struct task_str #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) @@ -258,7 +230,7 @@ index 848739c15de82..4b7148fd5551f 100644 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ _TIF_NOTIFY_SIGNAL) -@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_struct *dst, +@@ -110,6 +114,8 @@ int arch_dup_task_struct(struct task_str _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) @@ -267,10 +239,28 @@ index 848739c15de82..4b7148fd5551f 100644 #ifdef CONFIG_SHADOW_CALL_STACK #define INIT_SCS \ .scs_base = init_shadow_call_stack, \ -diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c -index 1197e7679882e..e74c0415f67ea 100644 ---- a/arch/arm64/kernel/asm-offsets.c -+++ b/arch/arm64/kernel/asm-offsets.c +diff -rupN linux.orig/arch/arm64/Kconfig linux/arch/arm64/Kconfig +--- linux.orig/arch/arm64/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -93,6 +93,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select 
ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_RT + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -200,6 +201,7 @@ config ARM64 + select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_DYNAMIC_KEY + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_FUNCTION_ARG_ACCESS_API + select MMU_GATHER_RCU_TABLE_FREE +diff -rupN linux.orig/arch/arm64/kernel/asm-offsets.c linux/arch/arm64/kernel/asm-offsets.c +--- linux.orig/arch/arm64/kernel/asm-offsets.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/asm-offsets.c 2022-12-04 10:40:26.676034147 -0500 @@ -32,6 +32,7 @@ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); @@ -279,11 +269,10 @@ index 1197e7679882e..e74c0415f67ea 100644 #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif -diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c -index 9ad911f1647c8..545c41a84411e 100644 ---- a/arch/arm64/kernel/signal.c -+++ b/arch/arm64/kernel/signal.c -@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *regs) +diff -rupN linux.orig/arch/arm64/kernel/signal.c linux/arch/arm64/kernel/signal.c +--- linux.orig/arch/arm64/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/arm64/kernel/signal.c 2022-12-04 10:40:26.676034147 -0500 +@@ -1103,7 +1103,7 @@ static void do_signal(struct pt_regs *re void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { @@ -292,34 +281,10 @@ index 9ad911f1647c8..545c41a84411e 100644 /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index cbe7bb029aec8..ad5bcc255f4e3 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -149,6 +149,7 @@ config PPC - select ARCH_STACKWALK - select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x -+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if PPC64 - select ARCH_USE_MEMTEST -@@ -241,8 +242,10 @@ config PPC - select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE -+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM - select HAVE_RSEQ - select HAVE_SETUP_PER_CPU_AREA if PPC64 - select HAVE_SOFTIRQ_ON_OWN_STACK -diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h -index 1c8460e235838..b1653c160bab9 100644 ---- a/arch/powerpc/include/asm/stackprotector.h -+++ b/arch/powerpc/include/asm/stackprotector.h -@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) +diff -rupN linux.orig/arch/powerpc/include/asm/stackprotector.h linux/arch/powerpc/include/asm/stackprotector.h +--- linux.orig/arch/powerpc/include/asm/stackprotector.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/stackprotector.h 2022-12-04 10:40:26.676034147 -0500 +@@ -24,7 +24,11 @@ static __always_inline void boot_init_st unsigned long canary; /* Try to get a semi random initial value. 
*/ @@ -331,10 +296,9 @@ index 1c8460e235838..b1653c160bab9 100644 canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; -diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h -index af58f1ed3952e..520864de8bb27 100644 ---- a/arch/powerpc/include/asm/thread_info.h -+++ b/arch/powerpc/include/asm/thread_info.h +diff -rupN linux.orig/arch/powerpc/include/asm/thread_info.h linux/arch/powerpc/include/asm/thread_info.h +--- linux.orig/arch/powerpc/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -53,6 +53,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, @@ -389,11 +353,32 @@ index af58f1ed3952e..520864de8bb27 100644 /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ -diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c -index f9db0a172401a..38aa3d06c632c 100644 ---- a/arch/powerpc/kernel/interrupt.c -+++ b/arch/powerpc/kernel/interrupt.c -@@ -184,7 +184,7 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs) +diff -rupN linux.orig/arch/powerpc/Kconfig linux/arch/powerpc/Kconfig +--- linux.orig/arch/powerpc/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -149,6 +149,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -241,8 +242,10 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 + select HAVE_SOFTIRQ_ON_OWN_STACK +diff -rupN linux.orig/arch/powerpc/kernel/interrupt.c linux/arch/powerpc/kernel/interrupt.c +--- linux.orig/arch/powerpc/kernel/interrupt.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/interrupt.c 2022-12-04 10:40:26.676034147 -0500 +@@ -184,7 +184,7 @@ again: ti_flags = read_thread_flags(); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); @@ -402,7 +387,7 @@ index f9db0a172401a..38aa3d06c632c 100644 schedule(); } else { /* -@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) +@@ -388,11 +388,15 @@ notrace unsigned long interrupt_exit_ker /* Returning to a kernel context with local irqs enabled. 
*/ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: @@ -419,10 +404,9 @@ index f9db0a172401a..38aa3d06c632c 100644 } } -diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c -index dadfcef5d6db4..3bfe55d82b042 100644 ---- a/arch/powerpc/kernel/traps.c -+++ b/arch/powerpc/kernel/traps.c +diff -rupN linux.orig/arch/powerpc/kernel/traps.c linux/arch/powerpc/kernel/traps.c +--- linux.orig/arch/powerpc/kernel/traps.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kernel/traps.c 2022-12-04 10:40:26.676034147 -0500 @@ -260,12 +260,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) @@ -442,10 +426,9 @@ index dadfcef5d6db4..3bfe55d82b042 100644 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig -index dcb398d5e0093..2cfa432afdb12 100644 ---- a/arch/powerpc/kvm/Kconfig -+++ b/arch/powerpc/kvm/Kconfig +diff -rupN linux.orig/arch/powerpc/kvm/Kconfig linux/arch/powerpc/kvm/Kconfig +--- linux.orig/arch/powerpc/kvm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/kvm/Kconfig 2022-12-04 10:40:26.676034147 -0500 @@ -221,6 +221,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" @@ -454,10 +437,9 @@ index dcb398d5e0093..2cfa432afdb12 100644 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING -diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c -index 561adac690229..61c4c0610aa6a 100644 ---- a/arch/powerpc/platforms/pseries/iommu.c -+++ b/arch/powerpc/platforms/pseries/iommu.c +diff -rupN linux.orig/arch/powerpc/platforms/pseries/iommu.c linux/arch/powerpc/platforms/pseries/iommu.c +--- linux.orig/arch/powerpc/platforms/pseries/iommu.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/powerpc/platforms/pseries/iommu.c 2022-12-04 10:40:26.676034147 -0500 @@ -24,6 +24,7 @@ #include #include @@ -466,7 +448,7 @@ index 561adac690229..61c4c0610aa6a 100644 #include #include #include -@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, +@@ -195,7 +196,13 @@ static int tce_build_pSeriesLP(unsigned return ret; } @@ -481,7 +463,7 @@ index 561adac690229..61c4c0610aa6a 100644 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, -@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -218,9 +225,10 @@ static int tce_buildmulti_pSeriesLP(stru direction, attrs); } @@ -494,7 +476,7 @@ index 561adac690229..61c4c0610aa6a 100644 /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() -@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -229,12 +237,12 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { @@ -509,7 +491,7 @@ index 561adac690229..61c4c0610aa6a 100644 } rpn = __pa(uaddr) >> tceshift; -@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +@@ -264,7 +272,7 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); @@ -518,7 +500,7 @@ index 561adac690229..61c4c0610aa6a 100644 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; -@@ -440,16 +448,17 @@ 
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -440,16 +448,17 @@ static int tce_setrange_multi_pSeriesLP( DMA_BIDIRECTIONAL, 0); } @@ -540,7 +522,7 @@ index 561adac690229..61c4c0610aa6a 100644 } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; -@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, +@@ -492,7 +501,7 @@ static int tce_setrange_multi_pSeriesLP( /* error cleanup: caller will clear whole range */ @@ -549,31 +531,10 @@ index 561adac690229..61c4c0610aa6a 100644 return rc; } -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 159c025ebb03e..4d62ceece1bb0 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -109,6 +109,7 @@ config X86 - select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG - select ARCH_SUPPORTS_LTO_CLANG_THIN -+ select ARCH_SUPPORTS_RT - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_MEMTEST - select ARCH_USE_QUEUED_RWLOCKS -@@ -243,6 +244,7 @@ config X86 - select HAVE_PCI - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP -+ select HAVE_PREEMPT_LAZY - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT - select MMU_GATHER_MERGE_VMAS - select HAVE_POSIX_CPU_TIMERS_TASK_WORK -diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h -index 5f6daea1ee248..cd20b4a5719a4 100644 ---- a/arch/x86/include/asm/preempt.h -+++ b/arch/x86/include/asm/preempt.h -@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) +diff -rupN linux.orig/arch/x86/include/asm/preempt.h linux/arch/x86/include/asm/preempt.h +--- linux.orig/arch/x86/include/asm/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/preempt.h 2022-12-04 10:40:26.676034147 -0500 +@@ -90,17 +90,48 @@ static __always_inline void __preempt_co * a decrement which hits zero means we have no preempt_count and should * reschedule. 
*/ @@ -623,10 +584,9 @@ index 5f6daea1ee248..cd20b4a5719a4 100644 } #ifdef CONFIG_PREEMPTION -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index f0cb881c1d690..fd8fb76f324fc 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h +diff -rupN linux.orig/arch/x86/include/asm/thread_info.h linux/arch/x86/include/asm/thread_info.h +--- linux.orig/arch/x86/include/asm/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/include/asm/thread_info.h 2022-12-04 10:40:26.676034147 -0500 @@ -57,6 +57,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ @@ -660,11 +620,29 @@ index f0cb881c1d690..fd8fb76f324fc 100644 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c -index fac8ff983aec8..65fb9bad1577a 100644 ---- a/drivers/bcma/driver_gpio.c -+++ b/drivers/bcma/driver_gpio.c -@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/arch/x86/Kconfig linux/arch/x86/Kconfig +--- linux.orig/arch/x86/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/arch/x86/Kconfig 2022-12-04 10:40:26.676034147 -0500 +@@ -109,6 +109,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -243,6 +244,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK +diff -rupN linux.orig/drivers/bcma/driver_gpio.c linux/drivers/bcma/driver_gpio.c +--- linux.orig/drivers/bcma/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/bcma/driver_gpio.c 2022-12-04 10:40:26.680034137 -0500 +@@ -115,7 +115,7 @@ static irqreturn_t bcma_gpio_irq_handler return IRQ_NONE; for_each_set_bit(gpio, &irqs, gc->ngpio) @@ -673,11 +651,10 @@ index fac8ff983aec8..65fb9bad1577a 100644 bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index 226ea76cc8197..4043d909d41bf 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -60,6 +60,40 @@ static void zram_free_page(struct zram *zram, size_t index); +diff -rupN linux.orig/drivers/block/zram/zram_drv.c linux/drivers/block/zram/zram_drv.c +--- linux.orig/drivers/block/zram/zram_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.c 2022-12-04 10:40:26.680034137 -0500 +@@ -60,6 +60,40 @@ static void zram_free_page(struct zram * static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); @@ -718,7 +695,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static int zram_slot_trylock(struct zram *zram, u32 index) { -@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) +@@ -75,6 +109,7 @@ static void zram_slot_unlock(struct zram { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } @@ -726,7 +703,7 @@ index 226ea76cc8197..4043d909d41bf 100644 static inline bool init_done(struct zram *zram) { -@@ 
-1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) +@@ -1198,6 +1233,7 @@ static bool zram_meta_alloc(struct zram if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); @@ -734,10 +711,9 @@ index 226ea76cc8197..4043d909d41bf 100644 return true; } -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index 80c3b43b4828f..ff021a9728d1e 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h +diff -rupN linux.orig/drivers/block/zram/zram_drv.h linux/drivers/block/zram/zram_drv.h +--- linux.orig/drivers/block/zram/zram_drv.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/block/zram/zram_drv.h 2022-12-04 10:40:26.680034137 -0500 @@ -63,6 +63,9 @@ struct zram_table_entry { unsigned long element; }; @@ -748,11 +724,10 @@ index 80c3b43b4828f..ff021a9728d1e 100644 #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif -diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c -index bcff6429e0b4f..4a9ae338a2bdf 100644 ---- a/drivers/char/tpm/tpm_tis.c -+++ b/drivers/char/tpm/tpm_tis.c -@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da +diff -rupN linux.orig/drivers/char/tpm/tpm_tis.c linux/drivers/char/tpm/tpm_tis.c +--- linux.orig/drivers/char/tpm/tpm_tis.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/char/tpm/tpm_tis.c 2022-12-04 10:40:26.680034137 -0500 +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to return container_of(data, struct tpm_tis_tcg_phy, priv); } @@ -784,7 +759,7 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); -@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, +@@ -185,12 +210,12 @@ static int tpm_tcg_write_bytes(struct tp switch (io_mode) { case TPM_TIS_PHYS_8: while (len--) @@ -799,11 +774,10 @@ index bcff6429e0b4f..4a9ae338a2bdf 100644 break; } -diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c -index 64cb060d9d753..77a41151c921b 100644 ---- a/drivers/gpio/gpio-mlxbf2.c -+++ b/drivers/gpio/gpio-mlxbf2.c -@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) +diff -rupN linux.orig/drivers/gpio/gpio-mlxbf2.c linux/drivers/gpio/gpio-mlxbf2.c +--- linux.orig/drivers/gpio/gpio-mlxbf2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpio/gpio-mlxbf2.c 2022-12-04 10:40:26.680034137 -0500 +@@ -273,10 +273,8 @@ static irqreturn_t mlxbf2_gpio_irq_handl pending = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CAUSE_EVTEN0); writel(pending, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -816,23 +790,10 @@ index 64cb060d9d753..77a41151c921b 100644 return IRQ_RETVAL(pending); } -diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig -index 7ae3b7d67fcfc..844f54f1daea9 100644 ---- a/drivers/gpu/drm/i915/Kconfig -+++ b/drivers/gpu/drm/i915/Kconfig -@@ -3,7 +3,6 @@ config DRM_I915 - tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" - depends on DRM - depends on X86 && PCI -- depends on !PREEMPT_RT - select INTEL_GTT if X86 - select INTERVAL_TREE - # we need shmfs for the swappable backing store, and in particular -diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c -index 4442aa355f868..23085e82c3ed5 100644 ---- a/drivers/gpu/drm/i915/display/intel_crtc.c -+++ b/drivers/gpu/drm/i915/display/intel_crtc.c -@@ -522,7 +522,8 @@ void 
intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +diff -rupN linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c linux/drivers/gpu/drm/i915/display/intel_crtc.c +--- linux.orig/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/display/intel_crtc.c 2022-12-04 10:40:26.680034137 -0500 +@@ -522,7 +522,8 @@ void intel_pipe_update_start(struct inte */ intel_psr_wait_for_idle_locked(new_crtc_state); @@ -842,7 +803,7 @@ index 4442aa355f868..23085e82c3ed5 100644 crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; -@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -547,11 +548,13 @@ void intel_pipe_update_start(struct inte break; } @@ -858,7 +819,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } finish_wait(wq, &wait); -@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct intel_crtc_state *new_crtc_state) +@@ -584,7 +587,8 @@ void intel_pipe_update_start(struct inte return; irq_disable: @@ -868,7 +829,7 @@ index 4442aa355f868..23085e82c3ed5 100644 } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) -@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) +@@ -685,7 +689,8 @@ void intel_pipe_update_end(struct intel_ */ intel_vrr_send_push(new_crtc_state); @@ -878,11 +839,10 @@ index 4442aa355f868..23085e82c3ed5 100644 if (intel_vgpu_active(dev_priv)) return; -diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -index ecc990ec1b952..8d04b10681f0d 100644 ---- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c -@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 2022-12-04 10:40:26.680034137 -0500 +@@ -312,10 +312,9 @@ void __intel_breadcrumbs_park(struct int /* Kick the work once more to drain the signalers, and disarm the irq */ irq_work_sync(&b->irq_work); while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { @@ -895,11 +855,10 @@ index ecc990ec1b952..8d04b10681f0d 100644 } } -diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -index c718e6dc40b51..0e592999b7d60 100644 ---- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c -@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +diff -rupN linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +--- linux.orig/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1302,7 +1302,7 @@ static void execlists_dequeue(struct int * and context switches) submission. */ @@ -908,7 +867,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * If the queue is higher priority than the last -@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1402,7 +1402,7 @@ static void execlists_dequeue(struct int * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. 
*/ @@ -917,7 +876,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; } } -@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1428,7 +1428,7 @@ static void execlists_dequeue(struct int if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); @@ -926,7 +885,7 @@ index c718e6dc40b51..0e592999b7d60 100644 return; /* leave this for another sibling */ } -@@ -1590,7 +1590,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1590,7 +1590,7 @@ done: */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); @@ -935,7 +894,7 @@ index c718e6dc40b51..0e592999b7d60 100644 /* * We can skip poking the HW if we ended up with exactly the same set -@@ -1616,13 +1616,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) +@@ -1616,13 +1616,6 @@ done: } } @@ -949,7 +908,7 @@ index c718e6dc40b51..0e592999b7d60 100644 static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); -@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) +@@ -2468,7 +2461,7 @@ static void execlists_submission_tasklet } if (!engine->execlists.pending[0]) { @@ -958,11 +917,10 @@ index c718e6dc40b51..0e592999b7d60 100644 start_timeslice(engine); } -diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c -index 73cebc6aa6507..98305fb393413 100644 ---- a/drivers/gpu/drm/i915/i915_irq.c -+++ b/drivers/gpu/drm/i915/i915_irq.c -@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_irq.c linux/drivers/gpu/drm/i915/i915_irq.c +--- linux.orig/drivers/gpu/drm/i915/i915_irq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_irq.c 2022-12-04 10:40:26.680034137 -0500 +@@ -917,7 +917,8 @@ static bool i915_get_crtc_scanoutpos(str */ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -972,7 +930,7 @@ index 73cebc6aa6507..98305fb393413 100644 /* Get optional system timestamp before query. 
*/ if (stime) -@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, +@@ -981,7 +982,8 @@ static bool i915_get_crtc_scanoutpos(str if (etime) *etime = ktime_get(); @@ -982,11 +940,10 @@ index 73cebc6aa6507..98305fb393413 100644 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c -index 62fad16a55e84..af07927650b24 100644 ---- a/drivers/gpu/drm/i915/i915_request.c -+++ b/drivers/gpu/drm/i915/i915_request.c -@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_request *request) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_request.c linux/drivers/gpu/drm/i915/i915_request.c +--- linux.orig/drivers/gpu/drm/i915/i915_request.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_request.c 2022-12-04 10:40:26.680034137 -0500 +@@ -612,7 +612,6 @@ bool __i915_request_submit(struct i915_r RQ_TRACE(request, "\n"); @@ -994,7 +951,7 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915_request *request) +@@ -721,7 +720,6 @@ void __i915_request_unsubmit(struct i915 */ RQ_TRACE(request, "\n"); @@ -1002,10 +959,9 @@ index 62fad16a55e84..af07927650b24 100644 lockdep_assert_held(&engine->sched_engine->lock); /* -diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h -index 37b5c9e9d260e..73f29d8008f0c 100644 ---- a/drivers/gpu/drm/i915/i915_trace.h -+++ b/drivers/gpu/drm/i915/i915_trace.h +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_trace.h linux/drivers/gpu/drm/i915/i915_trace.h +--- linux.orig/drivers/gpu/drm/i915/i915_trace.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_trace.h 2022-12-04 10:40:26.680034137 -0500 @@ -6,6 +6,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ @@ -1017,7 +973,7 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 #include #include #include -@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_add, +@@ -323,7 +327,7 @@ DEFINE_EVENT(i915_request, i915_request_ TP_ARGS(rq) ); @@ -1026,11 +982,10 @@ index 37b5c9e9d260e..73f29d8008f0c 100644 DEFINE_EVENT(i915_request, i915_request_guc_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) -diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h -index c10d68cdc3ca5..593f3a7e0e4fc 100644 ---- a/drivers/gpu/drm/i915/i915_utils.h -+++ b/drivers/gpu/drm/i915/i915_utils.h -@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) +diff -rupN linux.orig/drivers/gpu/drm/i915/i915_utils.h linux/drivers/gpu/drm/i915/i915_utils.h +--- linux.orig/drivers/gpu/drm/i915/i915_utils.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/i915_utils.h 2022-12-04 10:40:26.680034137 -0500 +@@ -294,7 +294,7 @@ wait_remaining_ms_from_jiffies(unsigned #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ @@ -1039,10 +994,20 @@ index c10d68cdc3ca5..593f3a7e0e4fc 100644 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) -diff --git a/drivers/net/ethernet/alacritech/slic.h b/drivers/net/ethernet/alacritech/slic.h -index 4eecbdfff3ff1..82071d0e5f7fc 100644 ---- a/drivers/net/ethernet/alacritech/slic.h -+++ b/drivers/net/ethernet/alacritech/slic.h +diff -rupN linux.orig/drivers/gpu/drm/i915/Kconfig linux/drivers/gpu/drm/i915/Kconfig +--- linux.orig/drivers/gpu/drm/i915/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/gpu/drm/i915/Kconfig 2022-12-04 10:40:26.680034137 -0500 +@@ -3,7 +3,6 @@ config DRM_I915 + tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" + depends on DRM + depends on X86 && PCI +- depends on !PREEMPT_RT + select INTEL_GTT if X86 + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular +diff -rupN linux.orig/drivers/net/ethernet/alacritech/slic.h linux/drivers/net/ethernet/alacritech/slic.h +--- linux.orig/drivers/net/ethernet/alacritech/slic.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/alacritech/slic.h 2022-12-04 10:40:26.680034137 -0500 @@ -288,13 +288,13 @@ do { \ u64_stats_update_end(&(st)->syncp); \ } while (0) @@ -1063,11 +1028,10 @@ index 4eecbdfff3ff1..82071d0e5f7fc 100644 } struct slic_upr { -diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -index 39242c5a17290..8f81d288c4880 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c -+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c -@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *src, u64 *dst, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -118,9 +118,9 @@ static void ena_safe_update_stat(u64 *sr unsigned int start; do { @@ -1079,11 +1043,10 @@ index 39242c5a17290..8f81d288c4880 100644 } static void ena_queue_stats(struct ena_adapter *adapter, u64 **data) -diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c -index 6a356a6cee15a..1c5d482990806 100644 ---- a/drivers/net/ethernet/amazon/ena/ena_netdev.c -+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c -@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c linux/drivers/net/ethernet/amazon/ena/ena_netdev.c +--- linux.orig/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/amazon/ena/ena_netdev.c 2022-12-04 10:40:26.680034137 -0500 +@@ -3270,10 +3270,10 @@ static void ena_get_stats64(struct net_d tx_ring = &adapter->tx_ring[i]; do { @@ -1096,7 +1059,7 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_device *netdev, +@@ -3281,20 +3281,20 @@ static void ena_get_stats64(struct net_d rx_ring = &adapter->rx_ring[i]; do { @@ -1121,11 +1084,10 @@ index 6a356a6cee15a..1c5d482990806 100644 stats->rx_dropped = rx_drops; stats->tx_dropped = tx_drops; -diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 
-index 25129e723b575..1e8d902e1c8ea 100644 ---- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c -@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +--- linux.orig/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/aquantia/atlantic/aq_ring.c 2022-12-04 10:40:26.680034137 -0500 +@@ -934,7 +934,7 @@ unsigned int aq_ring_fill_stats_data(str /* This data should mimic aq_ethtool_queue_rx_stat_names structure */ do { count = 0; @@ -1134,7 +1096,7 @@ index 25129e723b575..1e8d902e1c8ea 100644 data[count] = self->stats.rx.packets; data[++count] = self->stats.rx.jumbo_packets; data[++count] = self->stats.rx.lro_packets; -@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) +@@ -951,15 +951,15 @@ unsigned int aq_ring_fill_stats_data(str data[++count] = self->stats.rx.xdp_tx; data[++count] = self->stats.rx.xdp_invalid; data[++count] = self->stats.rx.xdp_redirect; @@ -1153,11 +1115,10 @@ index 25129e723b575..1e8d902e1c8ea 100644 } return ++count; -diff --git a/drivers/net/ethernet/asix/ax88796c_main.c b/drivers/net/ethernet/asix/ax88796c_main.c -index 6ba5b024a7be7..25e7beb68e515 100644 ---- a/drivers/net/ethernet/asix/ax88796c_main.c -+++ b/drivers/net/ethernet/asix/ax88796c_main.c -@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/asix/ax88796c_main.c linux/drivers/net/ethernet/asix/ax88796c_main.c +--- linux.orig/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/asix/ax88796c_main.c 2022-12-04 10:40:26.680034137 -0500 +@@ -662,12 +662,12 @@ static void ax88796c_get_stats64(struct s = per_cpu_ptr(ax_local->stats, cpu); do { @@ -1172,11 +1133,10 @@ index 6ba5b024a7be7..25e7beb68e515 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c -index e5857e88c2076..caf1714f36a18 100644 ---- a/drivers/net/ethernet/broadcom/b44.c -+++ b/drivers/net/ethernet/broadcom/b44.c -@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/b44.c linux/drivers/net/ethernet/broadcom/b44.c +--- linux.orig/drivers/net/ethernet/broadcom/b44.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/b44.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1680,7 +1680,7 @@ static void b44_get_stats64(struct net_d unsigned int start; do { @@ -1185,7 +1145,7 @@ index e5857e88c2076..caf1714f36a18 100644 /* Convert HW stats into rtnl_link_stats64 stats. 
*/ nstat->rx_packets = hwstat->rx_pkts; -@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_device *dev, +@@ -1714,7 +1714,7 @@ static void b44_get_stats64(struct net_d /* Carrier lost counter seems to be broken for some devices */ nstat->tx_carrier_errors = hwstat->tx_carrier_lost; #endif @@ -1194,7 +1154,7 @@ index e5857e88c2076..caf1714f36a18 100644 } -@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct net_device *dev, +@@ -2082,12 +2082,12 @@ static void b44_get_ethtool_stats(struct do { data_src = &hwstat->tx_good_octets; data_dst = data; @@ -1209,11 +1169,10 @@ index e5857e88c2076..caf1714f36a18 100644 } static void b44_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c -index 47fc8e6963d59..98d5bd15ee433 100644 ---- a/drivers/net/ethernet/broadcom/bcmsysport.c -+++ b/drivers/net/ethernet/broadcom/bcmsysport.c -@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, +diff -rupN linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c linux/drivers/net/ethernet/broadcom/bcmsysport.c +--- linux.orig/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/broadcom/bcmsysport.c 2022-12-04 10:40:26.680034137 -0500 +@@ -457,10 +457,10 @@ static void bcm_sysport_update_tx_stats( for (q = 0; q < priv->netdev->num_tx_queues; q++) { ring = &priv->tx_rings[q]; do { @@ -1226,7 +1185,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 *tx_bytes += bytes; *tx_packets += packets; -@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct net_device *dev, +@@ -504,9 +504,9 @@ static void bcm_sysport_get_stats(struct if (s->stat_sizeof == sizeof(u64) && s->type == BCM_SYSPORT_STAT_NETDEV64) { do { @@ -1238,7 +1197,7 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } else data[i] = *(u32 *)p; j++; -@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(struct net_device *dev, +@@ -1878,10 +1878,10 @@ static void bcm_sysport_get_stats64(stru &stats->tx_packets); do { @@ -1251,11 +1210,10 @@ index 47fc8e6963d59..98d5bd15ee433 100644 } static void bcm_sysport_netif_start(struct net_device *dev) -diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c -index 6dae768671e3d..9e6de2f968fa3 100644 ---- a/drivers/net/ethernet/cortina/gemini.c -+++ b/drivers/net/ethernet/cortina/gemini.c -@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/cortina/gemini.c linux/drivers/net/ethernet/cortina/gemini.c +--- linux.orig/drivers/net/ethernet/cortina/gemini.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/cortina/gemini.c 2022-12-04 10:40:26.680034137 -0500 +@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_ /* Racing with RX NAPI */ do { @@ -1264,7 +1222,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_packets = port->stats.rx_packets; stats->rx_bytes = port->stats.rx_bytes; -@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_ stats->rx_crc_errors = port->stats.rx_crc_errors; stats->rx_frame_errors = port->stats.rx_frame_errors; @@ -1278,7 +1236,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->tx_errors = port->stats.tx_errors; stats->tx_packets = port->stats.tx_packets; -@@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev, +@@ 
-1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_ stats->rx_missed_errors = port->stats.rx_missed_errors; stats->rx_fifo_errors = port->stats.rx_fifo_errors; @@ -1297,7 +1255,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 stats->rx_dropped += stats->rx_missed_errors; } -@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struc /* Racing with MIB interrupt */ do { p = values; @@ -1319,7 +1277,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < RX_STATUS_NUM; i++) *p++ = port->rx_stats[i]; -@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struc *p++ = port->rx_csum_stats[i]; *p++ = port->rx_napi_exits; @@ -1335,7 +1293,7 @@ index 6dae768671e3d..9e6de2f968fa3 100644 for (i = 0; i < TX_MAX_FRAGS; i++) { *values++ = port->tx_frag_stats[i]; -@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, +@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struc *values++ = port->tx_frags_linearized; *values++ = port->tx_hw_csummed; @@ -1344,11 +1302,10 @@ index 6dae768671e3d..9e6de2f968fa3 100644 } static int gmac_get_ksettings(struct net_device *netdev, -diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c -index bd0df189d8719..39e7a4a3c15e6 100644 ---- a/drivers/net/ethernet/emulex/benet/be_ethtool.c -+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c -@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c linux/drivers/net/ethernet/emulex/benet/be_ethtool.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_ethtool.c 2022-12-04 10:40:26.680034137 -0500 +@@ -389,10 +389,10 @@ static void be_get_ethtool_stats(struct struct be_rx_stats *stats = rx_stats(rxo); do { @@ -1361,7 +1318,7 @@ index bd0df189d8719..39e7a4a3c15e6 100644 for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { p = (u8 *)stats + et_rx_stats[i].offset; -@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct net_device *netdev, +@@ -405,19 +405,19 @@ static void be_get_ethtool_stats(struct struct be_tx_stats *stats = tx_stats(txo); do { @@ -1385,11 +1342,10 @@ index bd0df189d8719..39e7a4a3c15e6 100644 base += ETHTOOL_TXSTATS_NUM; } } -diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c -index 414362febbb9d..9350c901aa27b 100644 ---- a/drivers/net/ethernet/emulex/benet/be_main.c -+++ b/drivers/net/ethernet/emulex/benet/be_main.c -@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/emulex/benet/be_main.c linux/drivers/net/ethernet/emulex/benet/be_main.c +--- linux.orig/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/emulex/benet/be_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -665,10 +665,10 @@ static void be_get_stats64(struct net_de const struct be_rx_stats *rx_stats = rx_stats(rxo); do { @@ -1402,7 +1358,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->rx_packets += pkts; stats->rx_bytes += bytes; stats->multicast += rx_stats(rxo)->rx_mcast_pkts; -@@ -680,10 +680,10 @@ static void be_get_stats64(struct net_device *netdev, +@@ -680,10 +680,10 @@ static void 
be_get_stats64(struct net_de const struct be_tx_stats *tx_stats = tx_stats(txo); do { @@ -1415,7 +1371,7 @@ index 414362febbb9d..9350c901aa27b 100644 stats->tx_packets += pkts; stats->tx_bytes += bytes; } -@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_obj *eqo) +@@ -2155,16 +2155,16 @@ static int be_get_new_eqd(struct be_eq_o for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { do { @@ -1436,10 +1392,9 @@ index 414362febbb9d..9350c901aa27b 100644 } /* Skip, if wrapped around or first calculation */ -diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -index 671f51135c269..53b7e95213a85 100644 ---- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h -+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +diff -rupN linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +--- linux.orig/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/fungible/funeth/funeth_txrx.h 2022-12-04 10:40:26.684034126 -0500 @@ -206,9 +206,9 @@ struct funeth_rxq { #define FUN_QSTAT_READ(q, seq, stats_copy) \ @@ -1452,11 +1407,10 @@ index 671f51135c269..53b7e95213a85 100644 #define FUN_INT_NAME_LEN (IFNAMSIZ + 16) -diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c -index 7b9a2d9d96243..50b384910c839 100644 ---- a/drivers/net/ethernet/google/gve/gve_ethtool.c -+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c -@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c linux/drivers/net/ethernet/google/gve/gve_ethtool.c +--- linux.orig/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device struct gve_rx_ring *rx = &priv->rx[ring]; start = @@ -1473,7 +1427,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); rx_pkts += tmp_rx_pkts; rx_bytes += tmp_rx_bytes; -@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device if (priv->tx) { do { start = @@ -1486,7 +1440,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); tx_pkts += tmp_tx_pkts; tx_bytes += tmp_tx_bytes; -@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device data[i++] = rx->fill_cnt - rx->cnt; do { start = @@ -1502,7 +1456,7 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_rx_bytes; data[i++] = rx->rx_cont_packet_cnt; -@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev, +@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device } do { start = @@ -1514,11 +1468,10 @@ index 7b9a2d9d96243..50b384910c839 100644 start)); data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; -diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c -index 044db3ebb071c..6cafee55efc32 100644 ---- a/drivers/net/ethernet/google/gve/gve_main.c -+++ b/drivers/net/ethernet/google/gve/gve_main.c -@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +diff -rupN linux.orig/drivers/net/ethernet/google/gve/gve_main.c linux/drivers/net/ethernet/google/gve/gve_main.c +--- 
linux.orig/drivers/net/ethernet/google/gve/gve_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/google/gve/gve_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = @@ -1531,7 +1484,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->rx_packets += packets; s->rx_bytes += bytes; -@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_dev for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = @@ -1544,7 +1497,7 @@ index 044db3ebb071c..6cafee55efc32 100644 start)); s->tx_packets += packets; s->tx_bytes += bytes; -@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv) +@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_ } do { @@ -1556,11 +1509,10 @@ index 044db3ebb071c..6cafee55efc32 100644 stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_WAKE_CNT), .value = cpu_to_be64(priv->tx[idx].wake_queue), -diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -index 35d70041b9e84..f82e98263307a 100644 ---- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c -@@ -2486,7 +2486,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 2022-12-04 10:40:26.684034126 -0500 +@@ -2488,7 +2488,7 @@ static void hns3_fetch_stats(struct rtnl unsigned int start; do { @@ -1569,7 +1521,7 @@ index 35d70041b9e84..f82e98263307a 100644 if (is_tx) { stats->tx_bytes += ring->stats.tx_bytes; stats->tx_packets += ring->stats.tx_pkts; -@@ -2520,7 +2520,7 @@ static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, +@@ -2522,7 +2522,7 @@ static void hns3_fetch_stats(struct rtnl stats->multicast += ring->stats.rx_multicast; stats->rx_length_errors += ring->stats.err_pkt_len; } @@ -1578,11 +1530,5909 @@ index 35d70041b9e84..f82e98263307a 100644 } static void hns3_nic_get_stats64(struct net_device *netdev, -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -index e5828a658caf4..a866bea651103 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c -@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig +--- linux.orig/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c.orig 2022-12-04 10:40:18.116056079 -0500 +@@ -0,0 +1,5895 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++// Copyright (c) 2016-2017 Hisilicon Limited. 
++ ++#include ++#include ++#include ++#ifdef CONFIG_RFS_ACCEL ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "hnae3.h" ++#include "hns3_enet.h" ++/* All hns3 tracepoints are defined by the include below, which ++ * must be included exactly once across the whole kernel with ++ * CREATE_TRACE_POINTS defined ++ */ ++#define CREATE_TRACE_POINTS ++#include "hns3_trace.h" ++ ++#define hns3_set_field(origin, shift, val) ((origin) |= (val) << (shift)) ++#define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) ++ ++#define hns3_rl_err(fmt, ...) \ ++ do { \ ++ if (net_ratelimit()) \ ++ netdev_err(fmt, ##__VA_ARGS__); \ ++ } while (0) ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force); ++ ++static const char hns3_driver_name[] = "hns3"; ++static const char hns3_driver_string[] = ++ "Hisilicon Ethernet Network Driver for Hip08 Family"; ++static const char hns3_copyright[] = "Copyright (c) 2017 Huawei Corporation."; ++static struct hnae3_client client; ++ ++static int debug = -1; ++module_param(debug, int, 0); ++MODULE_PARM_DESC(debug, " Network interface message level setting"); ++ ++static unsigned int tx_sgl = 1; ++module_param(tx_sgl, uint, 0600); ++MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping"); ++ ++static bool page_pool_enabled = true; ++module_param(page_pool_enabled, bool, 0400); ++ ++#define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ ++ sizeof(struct sg_table)) ++#define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM), \ ++ dma_get_cache_alignment()) ++ ++#define DEFAULT_MSG_LEVEL (NETIF_MSG_PROBE | NETIF_MSG_LINK | \ ++ NETIF_MSG_IFDOWN | NETIF_MSG_IFUP) ++ ++#define HNS3_INNER_VLAN_TAG 1 ++#define HNS3_OUTER_VLAN_TAG 2 ++ ++#define HNS3_MIN_TX_LEN 33U ++#define HNS3_MIN_TUN_PKT_LEN 65U ++ ++/* hns3_pci_tbl - PCI Device ID Table ++ * ++ * Last entry must be all 0s ++ * ++ * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, ++ * Class, Class Mask, private data (not used) } ++ */ ++static const struct pci_device_id hns3_pci_tbl[] = { ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_200G_RDMA), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_VF), 0}, ++ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_RDMA_DCB_PFC_VF), ++ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, ++ /* required last entry */ ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, hns3_pci_tbl); ++ ++#define HNS3_RX_PTYPE_ENTRY(ptype, l, s, t, h) \ ++ { ptype, \ ++ l, \ ++ CHECKSUM_##s, \ ++ HNS3_L3_TYPE_##t, \ ++ 1, \ ++ h} ++ ++#define HNS3_RX_PTYPE_UNUSED_ENTRY(ptype) \ ++ { ptype, 0, CHECKSUM_NONE, HNS3_L3_TYPE_PARSE_FAIL, 0, \ ++ PKT_HASH_TYPE_NONE } ++ ++static const struct hns3_rx_ptype hns3_rx_ptype_tbl[] = { ++ HNS3_RX_PTYPE_UNUSED_ENTRY(0), ++ HNS3_RX_PTYPE_ENTRY(1, 0, COMPLETE, ARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(2, 0, 
COMPLETE, RARP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(3, 0, COMPLETE, LLDP, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(4, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(5, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(6, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(7, 0, COMPLETE, CNM, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(8, 0, NONE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(9), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(10), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(11), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(12), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(13), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(14), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(15), ++ HNS3_RX_PTYPE_ENTRY(16, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(17, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(18, 0, COMPLETE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(19, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(20, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(21, 0, NONE, IPV4, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(22, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(23, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(24, 0, NONE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(25, 0, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(26), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(27), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(28), ++ HNS3_RX_PTYPE_ENTRY(29, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(30, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(31, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(32, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(33, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(34, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(35, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(36, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(37, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(38), ++ HNS3_RX_PTYPE_ENTRY(39, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(40, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(41, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(42, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(43, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(44, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(45, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(46), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(47), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(48), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(49), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(50), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(51), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(52), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(53), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(54), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(55), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(56), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(57), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(58), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(59), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(60), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(61), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(62), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(63), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(64), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(65), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(66), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(67), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(68), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(69), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(70), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(71), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(72), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(73), 
++ HNS3_RX_PTYPE_UNUSED_ENTRY(74), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(75), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(76), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(77), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(78), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(79), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(80), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(81), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(82), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(83), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(84), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(85), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(86), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(87), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(88), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(89), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(90), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(91), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(92), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(93), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(94), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(95), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(96), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(97), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(98), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(99), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(100), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(101), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(102), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(103), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(104), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(105), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(106), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(107), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(108), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(109), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(110), ++ HNS3_RX_PTYPE_ENTRY(111, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(112, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(113, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(114, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(115, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(116, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(117, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(118, 0, NONE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(119, 0, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(120), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(121), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(122), ++ HNS3_RX_PTYPE_ENTRY(123, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(124, 0, COMPLETE, PARSE_FAIL, PKT_HASH_TYPE_NONE), ++ HNS3_RX_PTYPE_ENTRY(125, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(126, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(127, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(128, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(129, 1, UNNECESSARY, IPV4, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(130, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(131, 0, COMPLETE, IPV4, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(132), ++ HNS3_RX_PTYPE_ENTRY(133, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(134, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(135, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(136, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(137, 1, UNNECESSARY, IPV6, PKT_HASH_TYPE_L4), ++ HNS3_RX_PTYPE_ENTRY(138, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_ENTRY(139, 0, COMPLETE, IPV6, PKT_HASH_TYPE_L3), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(140), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(141), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(142), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(143), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(144), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(145), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(146), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(147), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(148), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(149), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(150), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(151), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(152), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(153), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(154), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(155), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(156), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(157), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(158), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(159), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(160), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(161), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(162), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(163), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(164), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(165), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(166), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(167), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(168), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(169), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(170), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(171), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(172), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(173), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(174), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(175), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(176), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(177), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(178), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(179), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(180), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(181), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(182), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(183), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(184), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(185), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(186), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(187), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(188), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(189), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(190), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(191), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(192), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(193), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(194), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(195), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(196), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(197), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(198), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(199), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(200), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(201), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(202), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(203), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(204), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(205), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(206), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(207), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(208), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(209), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(210), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(211), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(212), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(213), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(214), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(215), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(216), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(217), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(218), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(219), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(220), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(221), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(222), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(223), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(224), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(225), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(226), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(227), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(228), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(229), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(230), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(231), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(232), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(233), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(234), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(235), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(236), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(237), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(238), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(239), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(240), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(241), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(242), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(243), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(244), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(245), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(246), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(247), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(248), ++ 
HNS3_RX_PTYPE_UNUSED_ENTRY(249), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(250), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(251), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(252), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(253), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(254), ++ HNS3_RX_PTYPE_UNUSED_ENTRY(255), ++}; ++ ++#define HNS3_INVALID_PTYPE \ ++ ARRAY_SIZE(hns3_rx_ptype_tbl) ++ ++static irqreturn_t hns3_irq_handle(int irq, void *vector) ++{ ++ struct hns3_enet_tqp_vector *tqp_vector = vector; ++ ++ napi_schedule_irqoff(&tqp_vector->napi); ++ tqp_vector->event_cnt++; ++ ++ return IRQ_HANDLED; ++} ++ ++static void hns3_nic_uninit_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ unsigned int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag != HNS3_VECTOR_INITED) ++ continue; ++ ++ /* clear the affinity mask */ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, NULL); ++ ++ /* release the irq resource */ ++ free_irq(tqp_vectors->vector_irq, tqp_vectors); ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_NOT_INITED; ++ } ++} ++ ++static int hns3_nic_init_irq(struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_tqp_vector *tqp_vectors; ++ int txrx_int_idx = 0; ++ int rx_int_idx = 0; ++ int tx_int_idx = 0; ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vectors = &priv->tqp_vector[i]; ++ ++ if (tqp_vectors->irq_init_flag == HNS3_VECTOR_INITED) ++ continue; ++ ++ if (tqp_vectors->tx_group.ring && tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "TxRx", txrx_int_idx++); ++ txrx_int_idx++; ++ } else if (tqp_vectors->rx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Rx", rx_int_idx++); ++ } else if (tqp_vectors->tx_group.ring) { ++ snprintf(tqp_vectors->name, HNAE3_INT_NAME_LEN, ++ "%s-%s-%s-%d", hns3_driver_name, ++ pci_name(priv->ae_handle->pdev), ++ "Tx", tx_int_idx++); ++ } else { ++ /* Skip this unused q_vector */ ++ continue; ++ } ++ ++ tqp_vectors->name[HNAE3_INT_NAME_LEN - 1] = '\0'; ++ ++ irq_set_status_flags(tqp_vectors->vector_irq, IRQ_NOAUTOEN); ++ ret = request_irq(tqp_vectors->vector_irq, hns3_irq_handle, 0, ++ tqp_vectors->name, tqp_vectors); ++ if (ret) { ++ netdev_err(priv->netdev, "request irq(%d) fail\n", ++ tqp_vectors->vector_irq); ++ hns3_nic_uninit_irq(priv); ++ return ret; ++ } ++ ++ irq_set_affinity_hint(tqp_vectors->vector_irq, ++ &tqp_vectors->affinity_mask); ++ ++ tqp_vectors->irq_init_flag = HNS3_VECTOR_INITED; ++ } ++ ++ return 0; ++} ++ ++static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 mask_en) ++{ ++ writel(mask_en, tqp_vector->mask_addr); ++} ++ ++static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ napi_enable(&tqp_vector->napi); ++ enable_irq(tqp_vector->vector_irq); ++ ++ /* enable vector */ ++ hns3_mask_vector_irq(tqp_vector, 1); ++} ++ ++static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ /* disable vector */ ++ hns3_mask_vector_irq(tqp_vector, 0); ++ ++ disable_irq(tqp_vector->vector_irq); ++ napi_disable(&tqp_vector->napi); ++ cancel_work_sync(&tqp_vector->rx_group.dim.work); ++ cancel_work_sync(&tqp_vector->tx_group.dim.work); ++} ++ ++void hns3_set_vector_coalesce_rl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 rl_value) ++{ ++ u32 rl_reg = hns3_rl_usec_to_reg(rl_value); ++ ++ /* this 
defines the configuration for RL (Interrupt Rate Limiter). ++ * Rl defines rate of interrupts i.e. number of interrupts-per-second ++ * GL and RL(Rate Limiter) are 2 ways to acheive interrupt coalescing ++ */ ++ if (rl_reg > 0 && !tqp_vector->tx_group.coal.adapt_enable && ++ !tqp_vector->rx_group.coal.adapt_enable) ++ /* According to the hardware, the range of rl_reg is ++ * 0-59 and the unit is 4. ++ */ ++ rl_reg |= HNS3_INT_RL_ENABLE_MASK; ++ ++ writel(rl_reg, tqp_vector->mask_addr + HNS3_VECTOR_RL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->rx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL0_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_gl(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 gl_value) ++{ ++ u32 new_val; ++ ++ if (tqp_vector->tx_group.coal.unit_1us) ++ new_val = gl_value | HNS3_INT_GL_1US; ++ else ++ new_val = hns3_gl_usec_to_reg(gl_value); ++ ++ writel(new_val, tqp_vector->mask_addr + HNS3_VECTOR_GL1_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_tx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_TX_QL_OFFSET); ++} ++ ++void hns3_set_vector_coalesce_rx_ql(struct hns3_enet_tqp_vector *tqp_vector, ++ u32 ql_value) ++{ ++ writel(ql_value, tqp_vector->mask_addr + HNS3_VECTOR_RX_QL_OFFSET); ++} ++ ++static void hns3_vector_coalesce_init(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hns3_enet_coalesce *ptx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *prx_coal = &priv->rx_coal; ++ ++ tx_coal->adapt_enable = ptx_coal->adapt_enable; ++ rx_coal->adapt_enable = prx_coal->adapt_enable; ++ ++ tx_coal->int_gl = ptx_coal->int_gl; ++ rx_coal->int_gl = prx_coal->int_gl; ++ ++ rx_coal->flow_level = prx_coal->flow_level; ++ tx_coal->flow_level = ptx_coal->flow_level; ++ ++ /* device version above V3(include V3), GL can configure 1us ++ * unit, so uses 1us unit. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) { ++ tx_coal->unit_1us = 1; ++ rx_coal->unit_1us = 1; ++ } ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->ql_enable = 1; ++ rx_coal->ql_enable = 1; ++ tx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ rx_coal->int_ql_max = ae_dev->dev_specs.int_ql_max; ++ tx_coal->int_ql = ptx_coal->int_ql; ++ rx_coal->int_ql = prx_coal->int_ql; ++ } ++} ++ ++static void ++hns3_vector_coalesce_init_hw(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hns3_nic_priv *priv) ++{ ++ struct hns3_enet_coalesce *tx_coal = &tqp_vector->tx_group.coal; ++ struct hns3_enet_coalesce *rx_coal = &tqp_vector->rx_group.coal; ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, tx_coal->int_gl); ++ hns3_set_vector_coalesce_rx_gl(tqp_vector, rx_coal->int_gl); ++ hns3_set_vector_coalesce_rl(tqp_vector, h->kinfo.int_rl_setting); ++ ++ if (tx_coal->ql_enable) ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, tx_coal->int_ql); ++ ++ if (rx_coal->ql_enable) ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, rx_coal->int_ql); ++} ++ ++static int hns3_nic_set_real_num_queue(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ unsigned int queue_size = kinfo->num_tqps; ++ int i, ret; ++ ++ if (tc_info->num_tc <= 1 && !tc_info->mqprio_active) { ++ netdev_reset_tc(netdev); ++ } else { ++ ret = netdev_set_num_tc(netdev, tc_info->num_tc); ++ if (ret) { ++ netdev_err(netdev, ++ "netdev_set_num_tc fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ for (i = 0; i < tc_info->num_tc; i++) ++ netdev_set_tc_queue(netdev, i, tc_info->tqp_count[i], ++ tc_info->tqp_offset[i]); ++ } ++ ++ ret = netif_set_real_num_tx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_tx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ ret = netif_set_real_num_rx_queues(netdev, queue_size); ++ if (ret) { ++ netdev_err(netdev, ++ "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++u16 hns3_get_max_available_channels(struct hnae3_handle *h) ++{ ++ u16 alloc_tqps, max_rss_size, rss_size; ++ ++ h->ae_algo->ops->get_tqps_and_rss_info(h, &alloc_tqps, &max_rss_size); ++ rss_size = alloc_tqps / h->kinfo.tc_info.num_tc; ++ ++ return min_t(u16, rss_size, max_rss_size); ++} ++ ++static void hns3_tqp_enable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg |= BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_tqp_disable(struct hnae3_queue *tqp) ++{ ++ u32 rcb_reg; ++ ++ rcb_reg = hns3_read_dev(tqp, HNS3_RING_EN_REG); ++ rcb_reg &= ~BIT(HNS3_RING_EN_B); ++ hns3_write_dev(tqp, HNS3_RING_EN_REG, rcb_reg); ++} ++ ++static void hns3_free_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ free_irq_cpu_rmap(netdev->rx_cpu_rmap); ++ netdev->rx_cpu_rmap = NULL; ++#endif ++} ++ ++static int hns3_set_rx_cpu_rmap(struct net_device *netdev) ++{ ++#ifdef CONFIG_RFS_ACCEL ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i, ret; ++ ++ if (!netdev->rx_cpu_rmap) { ++ netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->vector_num); ++ if (!netdev->rx_cpu_rmap) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = 
irq_cpu_rmap_add(netdev->rx_cpu_rmap, ++ tqp_vector->vector_irq); ++ if (ret) { ++ hns3_free_rx_cpu_rmap(netdev); ++ return ret; ++ } ++ } ++#endif ++ return 0; ++} ++ ++static int hns3_nic_net_up(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ int i, j; ++ int ret; ++ ++ ret = hns3_nic_reset_all_ring(h); ++ if (ret) ++ return ret; ++ ++ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ /* enable the vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_enable(&priv->tqp_vector[i]); ++ ++ /* enable rcb */ ++ for (j = 0; j < h->kinfo.num_tqps; j++) ++ hns3_tqp_enable(h->kinfo.tqp[j]); ++ ++ /* start the ae_dev */ ++ ret = h->ae_algo->ops->start ? h->ae_algo->ops->start(h) : 0; ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ while (j--) ++ hns3_tqp_disable(h->kinfo.tqp[j]); ++ ++ for (j = i - 1; j >= 0; j--) ++ hns3_vector_disable(&priv->tqp_vector[j]); ++ } ++ ++ return ret; ++} ++ ++static void hns3_config_xps(struct hns3_nic_priv *priv) ++{ ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector = &priv->tqp_vector[i]; ++ struct hns3_enet_ring *ring = tqp_vector->tx_group.ring; ++ ++ while (ring) { ++ int ret; ++ ++ ret = netif_set_xps_queue(priv->netdev, ++ &tqp_vector->affinity_mask, ++ ring->tqp->tqp_index); ++ if (ret) ++ netdev_warn(priv->netdev, ++ "set xps queue failed: %d", ret); ++ ++ ring = ring->next; ++ } ++ } ++} ++ ++static int hns3_nic_net_open(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo; ++ int i, ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netdev_warn(netdev, "net open repeatedly!\n"); ++ return 0; ++ } ++ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_nic_set_real_num_queue(netdev); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_net_up(netdev); ++ if (ret) { ++ netdev_err(netdev, "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ kinfo = &h->kinfo; ++ for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) ++ netdev_set_prio_tc_map(netdev, i, kinfo->tc_info.prio_tc[i]); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, true); ++ ++ hns3_config_xps(priv); ++ ++ netif_dbg(h, drv, netdev, "net open\n"); ++ ++ return 0; ++} ++ ++static void hns3_reset_tx_queue(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct netdev_queue *dev_queue; ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ dev_queue = netdev_get_tx_queue(ndev, ++ priv->ring[i].queue_index); ++ netdev_tx_reset_queue(dev_queue); ++ } ++} ++ ++static void hns3_nic_net_down(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ const struct hnae3_ae_ops *ops; ++ int i; ++ ++ /* disable vectors */ ++ for (i = 0; i < priv->vector_num; i++) ++ hns3_vector_disable(&priv->tqp_vector[i]); ++ ++ /* disable rcb */ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_tqp_disable(h->kinfo.tqp[i]); ++ ++ /* stop ae_dev */ ++ ops = priv->ae_handle->ae_algo->ops; ++ if (ops->stop) ++ ops->stop(priv->ae_handle); ++ ++ /* delay ring buffer clearing to hns3_reset_notify_uninit_enet ++ * during reset process, because driver may not be able ++ * to disable 
the ring through firmware when downing the netdev. ++ */ ++ if (!hns3_nic_resetting(netdev)) ++ hns3_clear_all_ring(priv->ae_handle, false); ++ ++ hns3_reset_tx_queue(priv->ae_handle); ++} ++ ++static int hns3_nic_net_stop(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, "net stop\n"); ++ ++ if (h->ae_algo->ops->set_timer_task) ++ h->ae_algo->ops->set_timer_task(priv->ae_handle, false); ++ ++ netif_carrier_off(netdev); ++ netif_tx_disable(netdev); ++ ++ hns3_nic_net_down(netdev); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_uc_addr) ++ return h->ae_algo->ops->add_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_uc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ /* need ignore the request of removing device address, because ++ * we store the device address and other addresses of uc list ++ * in the function's mac filter list. ++ */ ++ if (ether_addr_equal(addr, netdev->dev_addr)) ++ return 0; ++ ++ if (h->ae_algo->ops->rm_uc_addr) ++ return h->ae_algo->ops->rm_uc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_sync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->add_mc_addr) ++ return h->ae_algo->ops->add_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static int hns3_nic_mc_unsync(struct net_device *netdev, ++ const unsigned char *addr) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->rm_mc_addr) ++ return h->ae_algo->ops->rm_mc_addr(h, addr); ++ ++ return 0; ++} ++ ++static u8 hns3_get_netdev_flags(struct net_device *netdev) ++{ ++ u8 flags = 0; ++ ++ if (netdev->flags & IFF_PROMISC) ++ flags = HNAE3_USER_UPE | HNAE3_USER_MPE | HNAE3_BPE; ++ else if (netdev->flags & IFF_ALLMULTI) ++ flags = HNAE3_USER_MPE; ++ ++ return flags; ++} ++ ++static void hns3_nic_set_rx_mode(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u8 new_flags; ++ ++ new_flags = hns3_get_netdev_flags(netdev); ++ ++ __dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync); ++ __dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync); ++ ++ /* User mode Promisc mode enable and vlan filtering is disabled to ++ * let all packets in. ++ */ ++ h->netdev_flags = new_flags; ++ hns3_request_update_promisc_mode(h); ++} ++ ++void hns3_request_update_promisc_mode(struct hnae3_handle *handle) ++{ ++ const struct hnae3_ae_ops *ops = handle->ae_algo->ops; ++ ++ if (ops->request_update_promisc_mode) ++ ops->request_update_promisc_mode(handle); ++} ++ ++static u32 hns3_tx_spare_space(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc, ntu; ++ ++ /* This smp_load_acquire() pairs with smp_store_release() in ++ * hns3_tx_spare_update() called in tx desc cleaning process. ++ */ ++ ntc = smp_load_acquire(&tx_spare->last_to_clean); ++ ntu = tx_spare->next_to_use; ++ ++ if (ntc > ntu) ++ return ntc - ntu - 1; ++ ++ /* The free tx buffer is divided into two part, so pick the ++ * larger one. 
++ */ ++ return max(ntc, tx_spare->len - ntu) - 1; ++} ++ ++static void hns3_tx_spare_update(struct hns3_enet_ring *ring) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (!tx_spare || ++ tx_spare->last_to_clean == tx_spare->next_to_clean) ++ return; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * hns3_tx_spare_space() called in xmit process. ++ */ ++ smp_store_release(&tx_spare->last_to_clean, ++ tx_spare->next_to_clean); ++} ++ ++static bool hns3_can_use_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ u32 len = skb->len <= ring->tx_copybreak ? skb->len : ++ skb_headlen(skb); ++ ++ if (len > ring->tx_copybreak) ++ return false; ++ ++ if (ALIGN(len, dma_get_cache_alignment()) > space) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ u32 space) ++{ ++ if (skb->len <= ring->tx_copybreak || !tx_sgl || ++ (!skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < tx_sgl)) ++ return false; ++ ++ if (space < HNS3_MAX_SGL_SIZE) { ++ hns3_ring_stats_update(ring, tx_spare_full); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) ++{ ++ u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; ++ struct hns3_tx_spare *tx_spare; ++ struct page *page; ++ dma_addr_t dma; ++ int order; ++ ++ if (!alloc_size) ++ return; ++ ++ order = get_order(alloc_size); ++ if (order >= MAX_ORDER) { ++ if (net_ratelimit()) ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); ++ return; ++ } ++ ++ tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare), ++ GFP_KERNEL); ++ if (!tx_spare) { ++ /* The driver still work without the tx spare buffer */ ++ dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n"); ++ goto devm_kzalloc_error; ++ } ++ ++ page = alloc_pages_node(dev_to_node(ring_to_dev(ring)), ++ GFP_KERNEL, order); ++ if (!page) { ++ dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n"); ++ goto alloc_pages_error; ++ } ++ ++ dma = dma_map_page(ring_to_dev(ring), page, 0, ++ PAGE_SIZE << order, DMA_TO_DEVICE); ++ if (dma_mapping_error(ring_to_dev(ring), dma)) { ++ dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n"); ++ goto dma_mapping_error; ++ } ++ ++ tx_spare->dma = dma; ++ tx_spare->buf = page_address(page); ++ tx_spare->len = PAGE_SIZE << order; ++ ring->tx_spare = tx_spare; ++ return; ++ ++dma_mapping_error: ++ put_page(page); ++alloc_pages_error: ++ devm_kfree(ring_to_dev(ring), tx_spare); ++devm_kzalloc_error: ++ ring->tqp->handle->kinfo.tx_spare_buf_size = 0; ++} ++ ++/* Use hns3_tx_spare_space() to make sure there is enough buffer ++ * before calling below function to allocate tx buffer. ++ */ ++static void *hns3_tx_spare_alloc(struct hns3_enet_ring *ring, ++ unsigned int size, dma_addr_t *dma, ++ u32 *cb_len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntu = tx_spare->next_to_use; ++ ++ size = ALIGN(size, dma_get_cache_alignment()); ++ *cb_len = size; ++ ++ /* Tx spare buffer wraps back here because the end of ++ * freed tx buffer is not enough. 
++ */ ++ if (ntu + size > tx_spare->len) { ++ *cb_len += (tx_spare->len - ntu); ++ ntu = 0; ++ } ++ ++ tx_spare->next_to_use = ntu + size; ++ if (tx_spare->next_to_use == tx_spare->len) ++ tx_spare->next_to_use = 0; ++ ++ *dma = tx_spare->dma + ntu; ++ ++ return tx_spare->buf + ntu; ++} ++ ++static void hns3_tx_spare_rollback(struct hns3_enet_ring *ring, u32 len) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ if (len > tx_spare->next_to_use) { ++ len -= tx_spare->next_to_use; ++ tx_spare->next_to_use = tx_spare->len - len; ++ } else { ++ tx_spare->next_to_use -= len; ++ } ++} ++ ++static void hns3_tx_spare_reclaim_cb(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ u32 ntc = tx_spare->next_to_clean; ++ u32 len = cb->length; ++ ++ tx_spare->next_to_clean += len; ++ ++ if (tx_spare->next_to_clean >= tx_spare->len) { ++ tx_spare->next_to_clean -= tx_spare->len; ++ ++ if (tx_spare->next_to_clean) { ++ ntc = 0; ++ len = tx_spare->next_to_clean; ++ } ++ } ++ ++ /* This tx spare buffer is only really reclaimed after calling ++ * hns3_tx_spare_update(), so it is still safe to use the info in ++ * the tx buffer to do the dma sync or sg unmapping after ++ * tx_spare->next_to_clean is moved forword. ++ */ ++ if (cb->type & (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) { ++ dma_addr_t dma = tx_spare->dma + ntc; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), dma, len, ++ DMA_TO_DEVICE); ++ } else { ++ struct sg_table *sgt = tx_spare->buf + ntc; ++ ++ dma_unmap_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ } ++} ++ ++static int hns3_set_tso(struct sk_buff *skb, u32 *paylen_fdop_ol4cs, ++ u16 *mss, u32 *type_cs_vlan_tso, u32 *send_bytes) ++{ ++ u32 l4_offset, hdr_len; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ u32 l4_paylen; ++ int ret; ++ ++ if (!skb_is_gso(skb)) ++ return 0; ++ ++ ret = skb_cow_head(skb, 0); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when tso is ++ * needed. ++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ ++ /* tunnel packet */ ++ if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | ++ SKB_GSO_GRE_CSUM | ++ SKB_GSO_UDP_TUNNEL | ++ SKB_GSO_UDP_TUNNEL_CSUM)) { ++ /* reset l3&l4 pointers from outer to inner headers */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ ++ /* Software should clear the IPv4's checksum field when ++ * tso is needed. 
++ */ ++ if (l3.v4->version == 4) ++ l3.v4->check = 0; ++ } ++ ++ /* normal or tunnel packet */ ++ l4_offset = l4.hdr - skb->data; ++ ++ /* remove payload length from inner pseudo checksum when tso */ ++ l4_paylen = skb->len - l4_offset; ++ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { ++ hdr_len = sizeof(*l4.udp) + l4_offset; ++ csum_replace_by_diff(&l4.udp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } else { ++ hdr_len = (l4.tcp->doff << 2) + l4_offset; ++ csum_replace_by_diff(&l4.tcp->check, ++ (__force __wsum)htonl(l4_paylen)); ++ } ++ ++ *send_bytes = (skb_shinfo(skb)->gso_segs - 1) * hdr_len + skb->len; ++ ++ /* find the txbd field values */ ++ *paylen_fdop_ol4cs = skb->len - hdr_len; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_TSO_B, 1); ++ ++ /* offload outer UDP header checksum */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) ++ hns3_set_field(*paylen_fdop_ol4cs, HNS3_TXD_OL4CS_B, 1); ++ ++ /* get MSS for TSO */ ++ *mss = skb_shinfo(skb)->gso_size; ++ ++ trace_hns3_tso(skb); ++ ++ return 0; ++} ++ ++static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, ++ u8 *il4_proto) ++{ ++ union l3_hdr_info l3; ++ unsigned char *l4_hdr; ++ unsigned char *exthdr; ++ u8 l4_proto_tmp; ++ __be16 frag_off; ++ ++ /* find outer header point */ ++ l3.hdr = skb_network_header(skb); ++ l4_hdr = skb_transport_header(skb); ++ ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (skb->protocol == htons(ETH_P_IP)) { ++ l4_proto_tmp = l3.v4->protocol; ++ } else { ++ return -EINVAL; ++ } ++ ++ *ol4_proto = l4_proto_tmp; ++ ++ /* tunnel packet */ ++ if (!skb->encapsulation) { ++ *il4_proto = 0; ++ return 0; ++ } ++ ++ /* find inner header point */ ++ l3.hdr = skb_inner_network_header(skb); ++ l4_hdr = skb_inner_transport_header(skb); ++ ++ if (l3.v6->version == 6) { ++ exthdr = l3.hdr + sizeof(*l3.v6); ++ l4_proto_tmp = l3.v6->nexthdr; ++ if (l4_hdr != exthdr) ++ ipv6_skip_exthdr(skb, exthdr - skb->data, ++ &l4_proto_tmp, &frag_off); ++ } else if (l3.v4->version == 4) { ++ l4_proto_tmp = l3.v4->protocol; ++ } ++ ++ *il4_proto = l4_proto_tmp; ++ ++ return 0; ++} ++ ++/* when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL ++ * and it is udp packet, which has a dest port as the IANA assigned. ++ * the hardware is expected to do the checksum offload, but the ++ * hardware will not do the checksum offload when udp dest port is ++ * 4789, 4790 or 6081. ++ */ ++static bool hns3_tunnel_csum_bug(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ union l4_hdr_info l4; ++ ++ /* device version above V3(include V3), the hardware can ++ * do this checksum offload. 
++ */ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ return false; ++ ++ l4.hdr = skb_transport_header(skb); ++ ++ if (!(!skb->encapsulation && ++ (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || ++ l4.udp->dest == htons(GENEVE_UDP_PORT) || ++ l4.udp->dest == htons(IANA_VXLAN_GPE_UDP_PORT)))) ++ return false; ++ ++ return true; ++} ++ ++static void hns3_set_outer_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u32 *ol_type_vlan_len_msec) ++{ ++ u32 l2_len, l3_len, l4_len; ++ unsigned char *il2_hdr; ++ union l3_hdr_info l3; ++ union l4_hdr_info l4; ++ ++ l3.hdr = skb_network_header(skb); ++ l4.hdr = skb_transport_header(skb); ++ ++ /* compute OL2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - skb->data; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute OL3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ il2_hdr = skb_inner_mac_header(skb); ++ /* compute OL4 header size, defined in 4 Bytes */ ++ l4_len = il2_hdr - l4.hdr; ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_L4LEN_S, l4_len >> 2); ++ ++ /* define outer network header type */ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (skb_is_gso(skb)) ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_CSUM); ++ else ++ hns3_set_field(*ol_type_vlan_len_msec, ++ HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV4_NO_CSUM); ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_OL3T_S, ++ HNS3_OL3T_IPV6); ++ } ++ ++ if (ol4_proto == IPPROTO_UDP) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_MAC_IN_UDP); ++ else if (ol4_proto == IPPROTO_GRE) ++ hns3_set_field(*ol_type_vlan_len_msec, HNS3_TXD_TUNTYPE_S, ++ HNS3_TUN_NVGRE); ++} ++ ++static void hns3_set_l3_type(struct sk_buff *skb, union l3_hdr_info l3, ++ u32 *type_cs_vlan_tso) ++{ ++ if (l3.v4->version == 4) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV4); ++ ++ /* the stack computes the IP header already, the only time we ++ * need the hardware to recompute it is in the case of TSO. ++ */ ++ if (skb_is_gso(skb)) ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3CS_B, 1); ++ } else if (l3.v6->version == 6) { ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3T_S, ++ HNS3_L3T_IPV6); ++ } ++} ++ ++static int hns3_set_l4_csum_length(struct sk_buff *skb, union l4_hdr_info l4, ++ u32 l4_proto, u32 *type_cs_vlan_tso) ++{ ++ /* compute inner(/normal) L4 header size, defined in 4 Bytes */ ++ switch (l4_proto) { ++ case IPPROTO_TCP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_TCP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ l4.tcp->doff); ++ break; ++ case IPPROTO_UDP: ++ if (hns3_tunnel_csum_bug(skb)) { ++ int ret = skb_put_padto(skb, HNS3_MIN_TUN_PKT_LEN); ++ ++ return ret ? 
ret : skb_checksum_help(skb); ++ } ++ ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_UDP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct udphdr) >> 2)); ++ break; ++ case IPPROTO_SCTP: ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4CS_B, 1); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4T_S, ++ HNS3_L4T_SCTP); ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L4LEN_S, ++ (sizeof(struct sctphdr) >> 2)); ++ break; ++ default: ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ return 0; ++} ++ ++static int hns3_set_l2l3l4(struct sk_buff *skb, u8 ol4_proto, ++ u8 il4_proto, u32 *type_cs_vlan_tso, ++ u32 *ol_type_vlan_len_msec) ++{ ++ unsigned char *l2_hdr = skb->data; ++ u32 l4_proto = ol4_proto; ++ union l4_hdr_info l4; ++ union l3_hdr_info l3; ++ u32 l2_len, l3_len; ++ ++ l4.hdr = skb_transport_header(skb); ++ l3.hdr = skb_network_header(skb); ++ ++ /* handle encapsulation skb */ ++ if (skb->encapsulation) { ++ /* If this is a not UDP/GRE encapsulation skb */ ++ if (!(ol4_proto == IPPROTO_UDP || ol4_proto == IPPROTO_GRE)) { ++ /* drop the skb tunnel packet if hardware don't support, ++ * because hardware can't calculate csum when TSO. ++ */ ++ if (skb_is_gso(skb)) ++ return -EDOM; ++ ++ /* the stack computes the IP header already, ++ * driver calculate l4 checksum when not TSO. ++ */ ++ return skb_checksum_help(skb); ++ } ++ ++ hns3_set_outer_l2l3l4(skb, ol4_proto, ol_type_vlan_len_msec); ++ ++ /* switch to inner header */ ++ l2_hdr = skb_inner_mac_header(skb); ++ l3.hdr = skb_inner_network_header(skb); ++ l4.hdr = skb_inner_transport_header(skb); ++ l4_proto = il4_proto; ++ } ++ ++ hns3_set_l3_type(skb, l3, type_cs_vlan_tso); ++ ++ /* compute inner(/normal) L2 header size, defined in 2 Bytes */ ++ l2_len = l3.hdr - l2_hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L2LEN_S, l2_len >> 1); ++ ++ /* compute inner(/normal) L3 header size, defined in 4 Bytes */ ++ l3_len = l4.hdr - l3.hdr; ++ hns3_set_field(*type_cs_vlan_tso, HNS3_TXD_L3LEN_S, l3_len >> 2); ++ ++ return hns3_set_l4_csum_length(skb, l4, l4_proto, type_cs_vlan_tso); ++} ++ ++static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring, ++ struct sk_buff *skb) ++{ ++ struct hnae3_handle *handle = tx_ring->tqp->handle; ++ struct hnae3_ae_dev *ae_dev; ++ struct vlan_ethhdr *vhdr; ++ int rc; ++ ++ if (!(skb->protocol == htons(ETH_P_8021Q) || ++ skb_vlan_tag_present(skb))) ++ return 0; ++ ++ /* For HW limitation on HNAE3_DEVICE_VERSION_V2, if port based insert ++ * VLAN enabled, only one VLAN header is allowed in skb, otherwise it ++ * will cause RAS error. ++ */ ++ ae_dev = pci_get_drvdata(handle->pdev); ++ if (unlikely(skb_vlan_tagged_multi(skb) && ++ ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_ENABLE)) ++ return -EINVAL; ++ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ !(handle->kinfo.netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { ++ /* When HW VLAN acceleration is turned off, and the stack ++ * sets the protocol to 802.1q, the driver just need to ++ * set the protocol to the encapsulated ethertype. 
++ */ ++ skb->protocol = vlan_get_protocol(skb); ++ return 0; ++ } ++ ++ if (skb_vlan_tag_present(skb)) { ++ /* Based on hw strategy, use out_vtag in two layer tag case, ++ * and use inner_vtag in one tag case. ++ */ ++ if (skb->protocol == htons(ETH_P_8021Q) && ++ handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ rc = HNS3_OUTER_VLAN_TAG; ++ else ++ rc = HNS3_INNER_VLAN_TAG; ++ ++ skb->protocol = vlan_get_protocol(skb); ++ return rc; ++ } ++ ++ rc = skb_cow_head(skb, 0); ++ if (unlikely(rc < 0)) ++ return rc; ++ ++ vhdr = (struct vlan_ethhdr *)skb->data; ++ vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT) ++ & VLAN_PRIO_MASK); ++ ++ skb->protocol = vlan_get_protocol(skb); ++ return 0; ++} ++ ++/* check if the hardware is capable of checksum offloading */ ++static bool hns3_check_hw_tx_csum(struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(skb->dev); ++ ++ /* Kindly note, due to backward compatibility of the TX descriptor, ++ * HW checksum of the non-IP packets and GSO packets is handled at ++ * different place in the following code ++ */ ++ if (skb_csum_is_sctp(skb) || skb_is_gso(skb) || ++ !test_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state)) ++ return false; ++ ++ return true; ++} ++ ++struct hns3_desc_param { ++ u32 paylen_ol4cs; ++ u32 ol_type_vlan_len_msec; ++ u32 type_cs_vlan_tso; ++ u16 mss_hw_csum; ++ u16 inner_vtag; ++ u16 out_vtag; ++}; ++ ++static void hns3_init_desc_data(struct sk_buff *skb, struct hns3_desc_param *pa) ++{ ++ pa->paylen_ol4cs = skb->len; ++ pa->ol_type_vlan_len_msec = 0; ++ pa->type_cs_vlan_tso = 0; ++ pa->mss_hw_csum = 0; ++ pa->inner_vtag = 0; ++ pa->out_vtag = 0; ++} ++ ++static int hns3_handle_vlan_info(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_param *param) ++{ ++ int ret; ++ ++ ret = hns3_handle_vtags(ring, skb); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_vlan_err); ++ return ret; ++ } else if (ret == HNS3_INNER_VLAN_TAG) { ++ param->inner_vtag = skb_vlan_tag_get(skb); ++ param->inner_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & ++ VLAN_PRIO_MASK; ++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_VLAN_B, 1); ++ } else if (ret == HNS3_OUTER_VLAN_TAG) { ++ param->out_vtag = skb_vlan_tag_get(skb); ++ param->out_vtag |= (skb->priority << VLAN_PRIO_SHIFT) & ++ VLAN_PRIO_MASK; ++ hns3_set_field(param->ol_type_vlan_len_msec, HNS3_TXD_OVLAN_B, ++ 1); ++ } ++ return 0; ++} ++ ++static int hns3_handle_csum_partial(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ struct hns3_desc_param *param) ++{ ++ u8 ol4_proto, il4_proto; ++ int ret; ++ ++ if (hns3_check_hw_tx_csum(skb)) { ++ /* set checksum start and offset, defined in 2 Bytes */ ++ hns3_set_field(param->type_cs_vlan_tso, HNS3_TXD_CSUM_START_S, ++ skb_checksum_start_offset(skb) >> 1); ++ hns3_set_field(param->ol_type_vlan_len_msec, ++ HNS3_TXD_CSUM_OFFSET_S, ++ skb->csum_offset >> 1); ++ param->mss_hw_csum |= BIT(HNS3_TXD_HW_CS_B); ++ return 0; ++ } ++ ++ skb_reset_mac_len(skb); ++ ++ ret = hns3_get_l4_protocol(skb, &ol4_proto, &il4_proto); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_l4_proto_err); ++ return ret; ++ } ++ ++ ret = hns3_set_l2l3l4(skb, ol4_proto, il4_proto, ++ ¶m->type_cs_vlan_tso, ++ ¶m->ol_type_vlan_len_msec); ++ if (unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_l2l3l4_err); ++ return ret; ++ } ++ ++ ret = hns3_set_tso(skb, ¶m->paylen_ol4cs, ¶m->mss_hw_csum, ++ ¶m->type_cs_vlan_tso, &desc_cb->send_bytes); ++ if 
(unlikely(ret < 0)) { ++ hns3_ring_stats_update(ring, tx_tso_err); ++ return ret; ++ } ++ return 0; ++} ++ ++static int hns3_fill_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, struct hns3_desc *desc, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc_param param; ++ int ret; ++ ++ hns3_init_desc_data(skb, ¶m); ++ ret = hns3_handle_vlan_info(ring, skb, ¶m); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ desc_cb->send_bytes = skb->len; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ ret = hns3_handle_csum_partial(ring, skb, desc_cb, ¶m); ++ if (ret) ++ return ret; ++ } ++ ++ /* Set txbd */ ++ desc->tx.ol_type_vlan_len_msec = ++ cpu_to_le32(param.ol_type_vlan_len_msec); ++ desc->tx.type_cs_vlan_tso_len = cpu_to_le32(param.type_cs_vlan_tso); ++ desc->tx.paylen_ol4cs = cpu_to_le32(param.paylen_ol4cs); ++ desc->tx.mss_hw_csum = cpu_to_le16(param.mss_hw_csum); ++ desc->tx.vlan_tag = cpu_to_le16(param.inner_vtag); ++ desc->tx.outer_vlan_tag = cpu_to_le16(param.out_vtag); ++ ++ return 0; ++} ++ ++static int hns3_fill_desc(struct hns3_enet_ring *ring, dma_addr_t dma, ++ unsigned int size) ++{ ++#define HNS3_LIKELY_BD_NUM 1 ++ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ unsigned int frag_buf_num; ++ int k, sizeoflast; ++ ++ if (likely(size <= HNS3_MAX_BD_SIZE)) { ++ desc->addr = cpu_to_le64(dma); ++ desc->tx.send_size = cpu_to_le16(size); ++ desc->tx.bdtp_fe_sc_vld_ra_ri = ++ cpu_to_le16(BIT(HNS3_TXD_VLD_B)); ++ ++ trace_hns3_tx_desc(ring, ring->next_to_use); ++ ring_ptr_move_fw(ring, next_to_use); ++ return HNS3_LIKELY_BD_NUM; ++ } ++ ++ frag_buf_num = hns3_tx_bd_count(size); ++ sizeoflast = size % HNS3_MAX_BD_SIZE; ++ sizeoflast = sizeoflast ? sizeoflast : HNS3_MAX_BD_SIZE; ++ ++ /* When frag size is bigger than hardware limit, split this frag */ ++ for (k = 0; k < frag_buf_num; k++) { ++ /* now, fill the descriptor */ ++ desc->addr = cpu_to_le64(dma + HNS3_MAX_BD_SIZE * k); ++ desc->tx.send_size = cpu_to_le16((k == frag_buf_num - 1) ? ++ (u16)sizeoflast : (u16)HNS3_MAX_BD_SIZE); ++ desc->tx.bdtp_fe_sc_vld_ra_ri = ++ cpu_to_le16(BIT(HNS3_TXD_VLD_B)); ++ ++ trace_hns3_tx_desc(ring, ring->next_to_use); ++ /* move ring pointer to next */ ++ ring_ptr_move_fw(ring, next_to_use); ++ ++ desc = &ring->desc[ring->next_to_use]; ++ } ++ ++ return frag_buf_num; ++} ++ ++static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, ++ unsigned int type) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct device *dev = ring_to_dev(ring); ++ unsigned int size; ++ dma_addr_t dma; ++ ++ if (type & (DESC_TYPE_FRAGLIST_SKB | DESC_TYPE_SKB)) { ++ struct sk_buff *skb = (struct sk_buff *)priv; ++ ++ size = skb_headlen(skb); ++ if (!size) ++ return 0; ++ ++ dma = dma_map_single(dev, skb->data, size, DMA_TO_DEVICE); ++ } else if (type & DESC_TYPE_BOUNCE_HEAD) { ++ /* Head data has been filled in hns3_handle_tx_bounce(), ++ * just return 0 here. 
++ */ ++ return 0; ++ } else { ++ skb_frag_t *frag = (skb_frag_t *)priv; ++ ++ size = skb_frag_size(frag); ++ if (!size) ++ return 0; ++ ++ dma = skb_frag_dma_map(dev, frag, 0, size, DMA_TO_DEVICE); ++ } ++ ++ if (unlikely(dma_mapping_error(dev, dma))) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = priv; ++ desc_cb->length = size; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ return hns3_fill_desc(ring, dma, size); ++} ++ ++static unsigned int hns3_skb_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num) ++{ ++ unsigned int size; ++ int i; ++ ++ size = skb_headlen(skb); ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ if (size) { ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ size = skb_frag_size(frag); ++ if (!size) ++ continue; ++ ++ while (size > HNS3_MAX_BD_SIZE) { ++ bd_size[bd_num++] = HNS3_MAX_BD_SIZE; ++ size -= HNS3_MAX_BD_SIZE; ++ ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ bd_size[bd_num++] = size; ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_tx_bd_num(struct sk_buff *skb, unsigned int *bd_size, ++ u8 max_non_tso_bd_num, unsigned int bd_num, ++ unsigned int recursion_level) ++{ ++#define HNS3_MAX_RECURSION_LEVEL 24 ++ ++ struct sk_buff *frag_skb; ++ ++ /* If the total len is within the max bd limit */ ++ if (likely(skb->len <= HNS3_MAX_BD_SIZE && !recursion_level && ++ !skb_has_frag_list(skb) && ++ skb_shinfo(skb)->nr_frags < max_non_tso_bd_num)) ++ return skb_shinfo(skb)->nr_frags + 1U; ++ ++ if (unlikely(recursion_level >= HNS3_MAX_RECURSION_LEVEL)) ++ return UINT_MAX; ++ ++ bd_num = hns3_skb_bd_num(skb, bd_size, bd_num); ++ if (!skb_has_frag_list(skb) || bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ ++ skb_walk_frags(skb, frag_skb) { ++ bd_num = hns3_tx_bd_num(frag_skb, bd_size, max_non_tso_bd_num, ++ bd_num, recursion_level + 1); ++ if (bd_num > HNS3_MAX_TSO_BD_NUM) ++ return bd_num; ++ } ++ ++ return bd_num; ++} ++ ++static unsigned int hns3_gso_hdr_len(struct sk_buff *skb) ++{ ++ if (!skb->encapsulation) ++ return skb_tcp_all_headers(skb); ++ ++ return skb_inner_tcp_all_headers(skb); ++} ++ ++/* HW need every continuous max_non_tso_bd_num buffer data to be larger ++ * than MSS, we simplify it by ensuring skb_headlen + the first continuous ++ * max_non_tso_bd_num - 1 frags to be larger than gso header len + mss, ++ * and the remaining continuous max_non_tso_bd_num - 1 frags to be larger ++ * than MSS except the last max_non_tso_bd_num - 1 frags. ++ */ ++static bool hns3_skb_need_linearized(struct sk_buff *skb, unsigned int *bd_size, ++ unsigned int bd_num, u8 max_non_tso_bd_num) ++{ ++ unsigned int tot_len = 0; ++ int i; ++ ++ for (i = 0; i < max_non_tso_bd_num - 1U; i++) ++ tot_len += bd_size[i]; ++ ++ /* ensure the first max_non_tso_bd_num frags is greater than ++ * mss + header ++ */ ++ if (tot_len + bd_size[max_non_tso_bd_num - 1U] < ++ skb_shinfo(skb)->gso_size + hns3_gso_hdr_len(skb)) ++ return true; ++ ++ /* ensure every continuous max_non_tso_bd_num - 1 buffer is greater ++ * than mss except the last one. 
++ */ ++ for (i = 0; i < bd_num - max_non_tso_bd_num; i++) { ++ tot_len -= bd_size[i]; ++ tot_len += bd_size[i + max_non_tso_bd_num - 1U]; ++ ++ if (tot_len < skb_shinfo(skb)->gso_size) ++ return true; ++ } ++ ++ return false; ++} ++ ++void hns3_shinfo_pack(struct skb_shared_info *shinfo, __u32 *size) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_SKB_FRAGS; i++) ++ size[i] = skb_frag_size(&shinfo->frags[i]); ++} ++ ++static int hns3_skb_linearize(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ unsigned int bd_num) ++{ ++ /* 'bd_num == UINT_MAX' means the skb' fraglist has a ++ * recursion level of over HNS3_MAX_RECURSION_LEVEL. ++ */ ++ if (bd_num == UINT_MAX) { ++ hns3_ring_stats_update(ring, over_max_recursion); ++ return -ENOMEM; ++ } ++ ++ /* The skb->len has exceeded the hw limitation, linearization ++ * will not help. ++ */ ++ if (skb->len > HNS3_MAX_TSO_SIZE || ++ (!skb_is_gso(skb) && skb->len > HNS3_MAX_NON_TSO_SIZE)) { ++ hns3_ring_stats_update(ring, hw_limitation); ++ return -ENOMEM; ++ } ++ ++ if (__skb_linearize(skb)) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int hns3_nic_maybe_stop_tx(struct hns3_enet_ring *ring, ++ struct net_device *netdev, ++ struct sk_buff *skb) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u8 max_non_tso_bd_num = priv->max_non_tso_bd_num; ++ unsigned int bd_size[HNS3_MAX_TSO_BD_NUM + 1U]; ++ unsigned int bd_num; ++ ++ bd_num = hns3_tx_bd_num(skb, bd_size, max_non_tso_bd_num, 0, 0); ++ if (unlikely(bd_num > max_non_tso_bd_num)) { ++ if (bd_num <= HNS3_MAX_TSO_BD_NUM && skb_is_gso(skb) && ++ !hns3_skb_need_linearized(skb, bd_size, bd_num, ++ max_non_tso_bd_num)) { ++ trace_hns3_over_max_bd(skb); ++ goto out; ++ } ++ ++ if (hns3_skb_linearize(ring, skb, bd_num)) ++ return -ENOMEM; ++ ++ bd_num = hns3_tx_bd_count(skb->len); ++ ++ hns3_ring_stats_update(ring, tx_copy); ++ } ++ ++out: ++ if (likely(ring_space(ring) >= bd_num)) ++ return bd_num; ++ ++ netif_stop_subqueue(netdev, ring->queue_index); ++ smp_mb(); /* Memory barrier before checking ring_space */ ++ ++ /* Start queue in case hns3_clean_tx_ring has just made room ++ * available and has not seen the queue stopped state performed ++ * by netif_stop_subqueue above. 
++ */ ++ if (ring_space(ring) >= bd_num && netif_carrier_ok(netdev) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_start_subqueue(netdev, ring->queue_index); ++ return bd_num; ++ } ++ ++ hns3_ring_stats_update(ring, tx_busy); ++ ++ return -EBUSY; ++} ++ ++static void hns3_clear_desc(struct hns3_enet_ring *ring, int next_to_use_orig) ++{ ++ struct device *dev = ring_to_dev(ring); ++ unsigned int i; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ struct hns3_desc *desc = &ring->desc[ring->next_to_use]; ++ struct hns3_desc_cb *desc_cb; ++ ++ memset(desc, 0, sizeof(*desc)); ++ ++ /* check if this is where we started */ ++ if (ring->next_to_use == next_to_use_orig) ++ break; ++ ++ /* rollback one */ ++ ring_ptr_move_bw(ring, next_to_use); ++ ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ ++ if (!desc_cb->dma) ++ continue; ++ ++ /* unmap the descriptor dma address */ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ else if (desc_cb->type & ++ (DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL)) ++ hns3_tx_spare_rollback(ring, desc_cb->length); ++ else if (desc_cb->length) ++ dma_unmap_page(dev, desc_cb->dma, desc_cb->length, ++ DMA_TO_DEVICE); ++ ++ desc_cb->length = 0; ++ desc_cb->dma = 0; ++ desc_cb->type = DESC_TYPE_UNKNOWN; ++ } ++} ++ ++static int hns3_fill_skb_to_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, unsigned int type) ++{ ++ struct sk_buff *frag_skb; ++ int i, ret, bd_num = 0; ++ ++ ret = hns3_map_and_fill_desc(ring, skb, type); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ ret = hns3_map_and_fill_desc(ring, frag, DESC_TYPE_PAGE); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ skb_walk_frags(skb, frag_skb) { ++ ret = hns3_fill_skb_to_desc(ring, frag_skb, ++ DESC_TYPE_FRAGLIST_SKB); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ return bd_num; ++} ++ ++static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) ++{ ++#define HNS3_BYTES_PER_64BIT 8 ++ ++ struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {}; ++ int offset = 0; ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ do { ++ int idx = (ring->next_to_use - num + ring->desc_num) % ++ ring->desc_num; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_push++; ++ u64_stats_update_end(&ring->syncp); ++ memcpy(&desc[offset], &ring->desc[idx], ++ sizeof(struct hns3_desc)); ++ offset++; ++ } while (--num); ++ ++ __iowrite64_copy(ring->tqp->mem_base, desc, ++ (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / ++ HNS3_BYTES_PER_64BIT); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) ++{ ++#define HNS3_MEM_DOORBELL_OFFSET 64 ++ ++ __le64 bd_num = cpu_to_le64((u64)ring->pending_buf); ++ ++ /* make sure everything is visible to device before ++ * excuting tx push or updating doorbell ++ */ ++ dma_wmb(); ++ ++ __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET, ++ &bd_num, 1); ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_mem_doorbell += ring->pending_buf; ++ u64_stats_update_end(&ring->syncp); ++ ++ io_stop_wc(); ++} ++ ++static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, ++ bool doorbell) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv 
*priv = netdev_priv(netdev); ++ ++ /* when tx push is enabled, the packet whose number of BD below ++ * HNS3_MAX_PUSH_BD_NUM can be pushed directly. ++ */ ++ if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num && ++ !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) { ++ hns3_tx_push_bd(ring, num); ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++ return; ++ } ++ ++ ring->pending_buf += num; ++ ++ if (!doorbell) { ++ hns3_ring_stats_update(ring, tx_more); ++ return; ++ } ++ ++ if (ring->tqp->mem_base) ++ hns3_tx_mem_doorbell(ring); ++ else ++ writel(ring->pending_buf, ++ ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG); ++ ++ ring->pending_buf = 0; ++ WRITE_ONCE(ring->last_to_use, ring->next_to_use); ++} ++ ++static void hns3_tsyn(struct net_device *netdev, struct sk_buff *skb, ++ struct hns3_desc *desc) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!(h->ae_algo->ops->set_tx_hwts_info && ++ h->ae_algo->ops->set_tx_hwts_info(h, skb))) ++ return; ++ ++ desc->tx.bdtp_fe_sc_vld_ra_ri |= cpu_to_le16(BIT(HNS3_TXD_TSYN_B)); ++} ++ ++static int hns3_handle_tx_bounce(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ unsigned int type = DESC_TYPE_BOUNCE_HEAD; ++ unsigned int size = skb_headlen(skb); ++ dma_addr_t dma; ++ int bd_num = 0; ++ u32 cb_len; ++ void *buf; ++ int ret; ++ ++ if (skb->len <= ring->tx_copybreak) { ++ size = skb->len; ++ type = DESC_TYPE_BOUNCE_ALL; ++ } ++ ++ /* hns3_can_use_tx_bounce() is called to ensure the below ++ * function can always return the tx buffer. ++ */ ++ buf = hns3_tx_spare_alloc(ring, size, &dma, &cb_len); ++ ++ ret = skb_copy_bits(skb, 0, buf, size); ++ if (unlikely(ret < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, copy_bits_err); ++ return ret; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = type; ++ ++ bd_num += hns3_fill_desc(ring, dma, size); ++ ++ if (type == DESC_TYPE_BOUNCE_HEAD) { ++ ret = hns3_fill_skb_to_desc(ring, skb, ++ DESC_TYPE_BOUNCE_HEAD); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ bd_num += ret; ++ } ++ ++ dma_sync_single_for_device(ring_to_dev(ring), dma, size, ++ DMA_TO_DEVICE); ++ ++ hns3_ring_stats_update(ring, tx_bounce); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_tx_sgl(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ u32 nfrag = skb_shinfo(skb)->nr_frags + 1; ++ struct sg_table *sgt; ++ int i, bd_num = 0; ++ dma_addr_t dma; ++ u32 cb_len; ++ int nents; ++ ++ if (skb_has_frag_list(skb)) ++ nfrag = HNS3_MAX_TSO_BD_NUM; ++ ++ /* hns3_can_use_tx_sgl() is called to ensure the below ++ * function can always return the tx buffer. 
++ */ ++ sgt = hns3_tx_spare_alloc(ring, HNS3_SGL_SIZE(nfrag), ++ &dma, &cb_len); ++ ++ /* scatterlist follows by the sg table */ ++ sgt->sgl = (struct scatterlist *)(sgt + 1); ++ sg_init_table(sgt->sgl, nfrag); ++ nents = skb_to_sgvec(skb, sgt->sgl, 0, skb->len); ++ if (unlikely(nents < 0)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, skb2sgl_err); ++ return -ENOMEM; ++ } ++ ++ sgt->orig_nents = nents; ++ sgt->nents = dma_map_sg(ring_to_dev(ring), sgt->sgl, sgt->orig_nents, ++ DMA_TO_DEVICE); ++ if (unlikely(!sgt->nents)) { ++ hns3_tx_spare_rollback(ring, cb_len); ++ hns3_ring_stats_update(ring, map_sg_err); ++ return -ENOMEM; ++ } ++ ++ desc_cb->priv = skb; ++ desc_cb->length = cb_len; ++ desc_cb->dma = dma; ++ desc_cb->type = DESC_TYPE_SGL_SKB; ++ ++ for (i = 0; i < sgt->nents; i++) ++ bd_num += hns3_fill_desc(ring, sg_dma_address(sgt->sgl + i), ++ sg_dma_len(sgt->sgl + i)); ++ hns3_ring_stats_update(ring, tx_sgl); ++ ++ return bd_num; ++} ++ ++static int hns3_handle_desc_filling(struct hns3_enet_ring *ring, ++ struct sk_buff *skb) ++{ ++ u32 space; ++ ++ if (!ring->tx_spare) ++ goto out; ++ ++ space = hns3_tx_spare_space(ring); ++ ++ if (hns3_can_use_tx_sgl(ring, skb, space)) ++ return hns3_handle_tx_sgl(ring, skb); ++ ++ if (hns3_can_use_tx_bounce(ring, skb, space)) ++ return hns3_handle_tx_bounce(ring, skb); ++ ++out: ++ return hns3_fill_skb_to_desc(ring, skb, DESC_TYPE_SKB); ++} ++ ++static int hns3_handle_skb_desc(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, ++ struct hns3_desc_cb *desc_cb, ++ int next_to_use_head) ++{ ++ int ret; ++ ++ ret = hns3_fill_skb_desc(ring, skb, &ring->desc[ring->next_to_use], ++ desc_cb); ++ if (unlikely(ret < 0)) ++ goto fill_err; ++ ++ /* 'ret < 0' means filling error, 'ret == 0' means skb->len is ++ * zero, which is unlikely, and 'ret > 0' means how many tx desc ++ * need to be notified to the hw. ++ */ ++ ret = hns3_handle_desc_filling(ring, skb); ++ if (likely(ret > 0)) ++ return ret; ++ ++fill_err: ++ hns3_clear_desc(ring, next_to_use_head); ++ return ret; ++} ++ ++netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hns3_enet_ring *ring = &priv->ring[skb->queue_mapping]; ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; ++ struct netdev_queue *dev_queue; ++ int pre_ntu, ret; ++ bool doorbell; ++ ++ /* Hardware can only handle short frames above 32 bytes */ ++ if (skb_put_padto(skb, HNS3_MIN_TX_LEN)) { ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ /* Prefetch the data used later */ ++ prefetch(skb->data); ++ ++ ret = hns3_nic_maybe_stop_tx(ring, netdev, skb); ++ if (unlikely(ret <= 0)) { ++ if (ret == -EBUSY) { ++ hns3_tx_doorbell(ring, 0, true); ++ return NETDEV_TX_BUSY; ++ } ++ ++ hns3_rl_err(netdev, "xmit error: %d!\n", ret); ++ goto out_err_tx_ok; ++ } ++ ++ ret = hns3_handle_skb_desc(ring, skb, desc_cb, ring->next_to_use); ++ if (unlikely(ret <= 0)) ++ goto out_err_tx_ok; ++ ++ pre_ntu = ring->next_to_use ? 
(ring->next_to_use - 1) : ++ (ring->desc_num - 1); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) ++ hns3_tsyn(netdev, skb, &ring->desc[pre_ntu]); ++ ++ ring->desc[pre_ntu].tx.bdtp_fe_sc_vld_ra_ri |= ++ cpu_to_le16(BIT(HNS3_TXD_FE_B)); ++ trace_hns3_tx_desc(ring, pre_ntu); ++ ++ skb_tx_timestamp(skb); ++ ++ /* Complete translate all packets */ ++ dev_queue = netdev_get_tx_queue(netdev, ring->queue_index); ++ doorbell = __netdev_tx_sent_queue(dev_queue, desc_cb->send_bytes, ++ netdev_xmit_more()); ++ hns3_tx_doorbell(ring, ret, doorbell); ++ ++ return NETDEV_TX_OK; ++ ++out_err_tx_ok: ++ dev_kfree_skb_any(skb); ++ hns3_tx_doorbell(ring, 0, !netdev_xmit_more()); ++ return NETDEV_TX_OK; ++} ++ ++static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) ++{ ++ char format_mac_addr_perm[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ char format_mac_addr_sa[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct sockaddr *mac_addr = p; ++ int ret; ++ ++ if (!mac_addr || !is_valid_ether_addr((const u8 *)mac_addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) { ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_info(netdev, "already using mac address %s\n", ++ format_mac_addr_sa); ++ return 0; ++ } ++ ++ /* For VF device, if there is a perm_addr, then the user will not ++ * be allowed to change the address. ++ */ ++ if (!hns3_is_phys_func(h->pdev) && ++ !is_zero_ether_addr(netdev->perm_addr)) { ++ hnae3_format_mac_addr(format_mac_addr_perm, netdev->perm_addr); ++ hnae3_format_mac_addr(format_mac_addr_sa, mac_addr->sa_data); ++ netdev_err(netdev, "has permanent MAC %s, user MAC %s not allow\n", ++ format_mac_addr_perm, format_mac_addr_sa); ++ return -EPERM; ++ } ++ ++ ret = h->ae_algo->ops->set_mac_addr(h, mac_addr->sa_data, false); ++ if (ret) { ++ netdev_err(netdev, "set_mac_address fail, ret=%d!\n", ret); ++ return ret; ++ } ++ ++ eth_hw_addr_set(netdev, mac_addr->sa_data); ++ ++ return 0; ++} ++ ++static int hns3_nic_do_ioctl(struct net_device *netdev, ++ struct ifreq *ifr, int cmd) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (!netif_running(netdev)) ++ return -EINVAL; ++ ++ if (!h->ae_algo->ops->do_ioctl) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->do_ioctl(h, ifr, cmd); ++} ++ ++static int hns3_nic_set_features(struct net_device *netdev, ++ netdev_features_t features) ++{ ++ netdev_features_t changed = netdev->features ^ features; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct hnae3_handle *h = priv->ae_handle; ++ bool enable; ++ int ret; ++ ++ if (changed & (NETIF_F_GRO_HW) && h->ae_algo->ops->set_gro_en) { ++ enable = !!(features & NETIF_F_GRO_HW); ++ ret = h->ae_algo->ops->set_gro_en(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_RX) && ++ h->ae_algo->ops->enable_hw_strip_rxvtag) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_RX); ++ ret = h->ae_algo->ops->enable_hw_strip_rxvtag(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if ((changed & NETIF_F_NTUPLE) && h->ae_algo->ops->enable_fd) { ++ enable = !!(features & NETIF_F_NTUPLE); ++ h->ae_algo->ops->enable_fd(h, enable); ++ } ++ ++ if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) && ++ h->ae_algo->ops->cls_flower_active(h)) { ++ netdev_err(netdev, ++ "there are offloaded TC filters active, cannot disable HW TC offload"); ++ return -EINVAL; ++ } ++ ++ if ((changed & NETIF_F_HW_VLAN_CTAG_FILTER) && ++ 
h->ae_algo->ops->enable_vlan_filter) { ++ enable = !!(features & NETIF_F_HW_VLAN_CTAG_FILTER); ++ ret = h->ae_algo->ops->enable_vlan_filter(h, enable); ++ if (ret) ++ return ret; ++ } ++ ++ netdev->features = features; ++ return 0; ++} ++ ++static netdev_features_t hns3_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++#define HNS3_MAX_HDR_LEN 480U ++#define HNS3_MAX_L4_HDR_LEN 60U ++ ++ size_t len; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ return features; ++ ++ if (skb->encapsulation) ++ len = skb_inner_transport_header(skb) - skb->data; ++ else ++ len = skb_transport_header(skb) - skb->data; ++ ++ /* Assume L4 is 60 byte as TCP is the only protocol with a ++ * a flexible value, and it's max len is 60 bytes. ++ */ ++ len += HNS3_MAX_L4_HDR_LEN; ++ ++ /* Hardware only supports checksum on the skb with a max header ++ * len of 480 bytes. ++ */ ++ if (len > HNS3_MAX_HDR_LEN) ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ ++ return features; ++} ++ ++static void hns3_fetch_stats(struct rtnl_link_stats64 *stats, ++ struct hns3_enet_ring *ring, bool is_tx) ++{ ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&ring->syncp); ++ if (is_tx) { ++ stats->tx_bytes += ring->stats.tx_bytes; ++ stats->tx_packets += ring->stats.tx_pkts; ++ stats->tx_dropped += ring->stats.sw_err_cnt; ++ stats->tx_dropped += ring->stats.tx_vlan_err; ++ stats->tx_dropped += ring->stats.tx_l4_proto_err; ++ stats->tx_dropped += ring->stats.tx_l2l3l4_err; ++ stats->tx_dropped += ring->stats.tx_tso_err; ++ stats->tx_dropped += ring->stats.over_max_recursion; ++ stats->tx_dropped += ring->stats.hw_limitation; ++ stats->tx_dropped += ring->stats.copy_bits_err; ++ stats->tx_dropped += ring->stats.skb2sgl_err; ++ stats->tx_dropped += ring->stats.map_sg_err; ++ stats->tx_errors += ring->stats.sw_err_cnt; ++ stats->tx_errors += ring->stats.tx_vlan_err; ++ stats->tx_errors += ring->stats.tx_l4_proto_err; ++ stats->tx_errors += ring->stats.tx_l2l3l4_err; ++ stats->tx_errors += ring->stats.tx_tso_err; ++ stats->tx_errors += ring->stats.over_max_recursion; ++ stats->tx_errors += ring->stats.hw_limitation; ++ stats->tx_errors += ring->stats.copy_bits_err; ++ stats->tx_errors += ring->stats.skb2sgl_err; ++ stats->tx_errors += ring->stats.map_sg_err; ++ } else { ++ stats->rx_bytes += ring->stats.rx_bytes; ++ stats->rx_packets += ring->stats.rx_pkts; ++ stats->rx_dropped += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l2_err; ++ stats->rx_errors += ring->stats.l3l4_csum_err; ++ stats->rx_crc_errors += ring->stats.l2_err; ++ stats->multicast += ring->stats.rx_multicast; ++ stats->rx_length_errors += ring->stats.err_pkt_len; ++ } ++ } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); ++} ++ ++static void hns3_nic_get_stats64(struct net_device *netdev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hnae3_handle *handle = priv->ae_handle; ++ struct rtnl_link_stats64 ring_total_stats; ++ struct hns3_enet_ring *ring; ++ unsigned int idx; ++ ++ if (test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) ++ return; ++ ++ handle->ae_algo->ops->update_stats(handle, &netdev->stats); ++ ++ memset(&ring_total_stats, 0, sizeof(ring_total_stats)); ++ for (idx = 0; idx < queue_num; idx++) { ++ /* fetch the tx stats */ ++ ring = &priv->ring[idx]; ++ hns3_fetch_stats(&ring_total_stats, ring, true); ++ ++ /* fetch the rx stats */ ++ ring = &priv->ring[idx + 
queue_num]; ++ hns3_fetch_stats(&ring_total_stats, ring, false); ++ } ++ ++ stats->tx_bytes = ring_total_stats.tx_bytes; ++ stats->tx_packets = ring_total_stats.tx_packets; ++ stats->rx_bytes = ring_total_stats.rx_bytes; ++ stats->rx_packets = ring_total_stats.rx_packets; ++ ++ stats->rx_errors = ring_total_stats.rx_errors; ++ stats->multicast = ring_total_stats.multicast; ++ stats->rx_length_errors = ring_total_stats.rx_length_errors; ++ stats->rx_crc_errors = ring_total_stats.rx_crc_errors; ++ stats->rx_missed_errors = netdev->stats.rx_missed_errors; ++ ++ stats->tx_errors = ring_total_stats.tx_errors; ++ stats->rx_dropped = ring_total_stats.rx_dropped; ++ stats->tx_dropped = ring_total_stats.tx_dropped; ++ stats->collisions = netdev->stats.collisions; ++ stats->rx_over_errors = netdev->stats.rx_over_errors; ++ stats->rx_frame_errors = netdev->stats.rx_frame_errors; ++ stats->rx_fifo_errors = netdev->stats.rx_fifo_errors; ++ stats->tx_aborted_errors = netdev->stats.tx_aborted_errors; ++ stats->tx_carrier_errors = netdev->stats.tx_carrier_errors; ++ stats->tx_fifo_errors = netdev->stats.tx_fifo_errors; ++ stats->tx_heartbeat_errors = netdev->stats.tx_heartbeat_errors; ++ stats->tx_window_errors = netdev->stats.tx_window_errors; ++ stats->rx_compressed = netdev->stats.rx_compressed; ++ stats->tx_compressed = netdev->stats.tx_compressed; ++} ++ ++static int hns3_setup_tc(struct net_device *netdev, void *type_data) ++{ ++ struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; ++ struct hnae3_knic_private_info *kinfo; ++ u8 tc = mqprio_qopt->qopt.num_tc; ++ u16 mode = mqprio_qopt->mode; ++ u8 hw = mqprio_qopt->qopt.hw; ++ struct hnae3_handle *h; ++ ++ if (!((hw == TC_MQPRIO_HW_OFFLOAD_TCS && ++ mode == TC_MQPRIO_MODE_CHANNEL) || (!hw && tc == 0))) ++ return -EOPNOTSUPP; ++ ++ if (tc > HNAE3_MAX_TC) ++ return -EINVAL; ++ ++ if (!netdev) ++ return -EINVAL; ++ ++ h = hns3_get_handle(netdev); ++ kinfo = &h->kinfo; ++ ++ netif_dbg(h, drv, netdev, "setup tc: num_tc=%u\n", tc); ++ ++ return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ? 
++ kinfo->dcb_ops->setup_tc(h, mqprio_qopt) : -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_cls_flower(struct hns3_nic_priv *priv, ++ struct flow_cls_offload *flow) ++{ ++ int tc = tc_classid_to_hwtc(priv->netdev, flow->classid); ++ struct hnae3_handle *h = hns3_get_handle(priv->netdev); ++ ++ switch (flow->command) { ++ case FLOW_CLS_REPLACE: ++ if (h->ae_algo->ops->add_cls_flower) ++ return h->ae_algo->ops->add_cls_flower(h, flow, tc); ++ break; ++ case FLOW_CLS_DESTROY: ++ if (h->ae_algo->ops->del_cls_flower) ++ return h->ae_algo->ops->del_cls_flower(h, flow); ++ break; ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static int hns3_setup_tc_block_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct hns3_nic_priv *priv = cb_priv; ++ ++ if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return hns3_setup_tc_cls_flower(priv, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static LIST_HEAD(hns3_block_cb_list); ++ ++static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(dev); ++ int ret; ++ ++ switch (type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ ret = hns3_setup_tc(dev, type_data); ++ break; ++ case TC_SETUP_BLOCK: ++ ret = flow_block_cb_setup_simple(type_data, ++ &hns3_block_cb_list, ++ hns3_setup_tc_block_cb, ++ priv, priv, true); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_add_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, false); ++ ++ return ret; ++} ++ ++static int hns3_vlan_rx_kill_vid(struct net_device *netdev, ++ __be16 proto, u16 vid) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ if (h->ae_algo->ops->set_vlan_filter) ++ ret = h->ae_algo->ops->set_vlan_filter(h, proto, vid, true); ++ ++ return ret; ++} ++ ++static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, ++ u8 qos, __be16 vlan_proto) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = -EIO; ++ ++ netif_dbg(h, drv, netdev, ++ "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=0x%x\n", ++ vf, vlan, qos, ntohs(vlan_proto)); ++ ++ if (h->ae_algo->ops->set_vf_vlan_filter) ++ ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan, ++ qos, vlan_proto); ++ ++ return ret; ++} ++ ++static int hns3_set_vf_spoofchk(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!handle->ae_algo->ops->set_vf_spoofchk) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_spoofchk(handle, vf, enable); ++} ++ ++static int hns3_set_vf_trust(struct net_device *netdev, int vf, bool enable) ++{ ++ struct hnae3_handle *handle = hns3_get_handle(netdev); ++ ++ if (!handle->ae_algo->ops->set_vf_trust) ++ return -EOPNOTSUPP; ++ ++ return handle->ae_algo->ops->set_vf_trust(handle, vf, enable); ++} ++ ++static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (!h->ae_algo->ops->set_mtu) ++ return -EOPNOTSUPP; ++ ++ netif_dbg(h, drv, netdev, ++ "change 
mtu from %u to %d\n", netdev->mtu, new_mtu); ++ ++ ret = h->ae_algo->ops->set_mtu(h, new_mtu); ++ if (ret) ++ netdev_err(netdev, "failed to change MTU in hardware %d\n", ++ ret); ++ else ++ netdev->mtu = new_mtu; ++ ++ return ret; ++} ++ ++static int hns3_get_timeout_queue(struct net_device *ndev) ++{ ++ int i; ++ ++ /* Find the stopped queue the same way the stack does */ ++ for (i = 0; i < ndev->num_tx_queues; i++) { ++ struct netdev_queue *q; ++ unsigned long trans_start; ++ ++ q = netdev_get_tx_queue(ndev, i); ++ trans_start = READ_ONCE(q->trans_start); ++ if (netif_xmit_stopped(q) && ++ time_after(jiffies, ++ (trans_start + ndev->watchdog_timeo))) { ++#ifdef CONFIG_BQL ++ struct dql *dql = &q->dql; ++ ++ netdev_info(ndev, "DQL info last_cnt: %u, queued: %u, adj_limit: %u, completed: %u\n", ++ dql->last_obj_cnt, dql->num_queued, ++ dql->adj_limit, dql->num_completed); ++#endif ++ netdev_info(ndev, "queue state: 0x%lx, delta msecs: %u\n", ++ q->state, ++ jiffies_to_msecs(jiffies - trans_start)); ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void hns3_dump_queue_stats(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring, ++ int timeout_queue) ++{ ++ struct napi_struct *napi = &tx_ring->tqp_vector->napi; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ netdev_info(ndev, ++ "tx_timeout count: %llu, queue id: %d, SW_NTU: 0x%x, SW_NTC: 0x%x, napi state: %lu\n", ++ priv->tx_timeout_count, timeout_queue, tx_ring->next_to_use, ++ tx_ring->next_to_clean, napi->state); ++ ++ netdev_info(ndev, ++ "tx_pkts: %llu, tx_bytes: %llu, sw_err_cnt: %llu, tx_pending: %d\n", ++ tx_ring->stats.tx_pkts, tx_ring->stats.tx_bytes, ++ tx_ring->stats.sw_err_cnt, tx_ring->pending_buf); ++ ++ netdev_info(ndev, ++ "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n", ++ tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more, ++ tx_ring->stats.restart_queue, tx_ring->stats.tx_busy); ++ ++ netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n", ++ tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell); ++} ++ ++static void hns3_dump_queue_reg(struct net_device *ndev, ++ struct hns3_enet_ring *tx_ring) ++{ ++ netdev_info(ndev, ++ "BD_NUM: 0x%x HW_HEAD: 0x%x, HW_TAIL: 0x%x, BD_ERR: 0x%x, INT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_NUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_HEAD_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TAIL_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_BD_ERR_REG), ++ readl(tx_ring->tqp_vector->mask_addr)); ++ netdev_info(ndev, ++ "RING_EN: 0x%x, TC: 0x%x, FBD_NUM: 0x%x FBD_OFT: 0x%x, EBD_NUM: 0x%x, EBD_OFT: 0x%x\n", ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_EN_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_TC_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_FBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_OFFSET_REG), ++ hns3_tqp_read_reg(tx_ring, HNS3_RING_TX_RING_EBDNUM_REG), ++ hns3_tqp_read_reg(tx_ring, ++ HNS3_RING_TX_RING_EBD_OFFSET_REG)); ++} ++ ++static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ struct hns3_enet_ring *tx_ring; ++ int timeout_queue; ++ ++ timeout_queue = hns3_get_timeout_queue(ndev); ++ if (timeout_queue >= ndev->num_tx_queues) { ++ netdev_info(ndev, ++ "no netdev TX timeout queue found, timeout count: %llu\n", ++ priv->tx_timeout_count); ++ return false; ++ } ++ ++ priv->tx_timeout_count++; ++ ++ tx_ring = &priv->ring[timeout_queue]; ++ 
hns3_dump_queue_stats(ndev, tx_ring, timeout_queue); ++ ++ /* When mac received many pause frames continuous, it's unable to send ++ * packets, which may cause tx timeout ++ */ ++ if (h->ae_algo->ops->get_mac_stats) { ++ struct hns3_mac_stats mac_stats; ++ ++ h->ae_algo->ops->get_mac_stats(h, &mac_stats); ++ netdev_info(ndev, "tx_pause_cnt: %llu, rx_pause_cnt: %llu\n", ++ mac_stats.tx_pause_cnt, mac_stats.rx_pause_cnt); ++ } ++ ++ hns3_dump_queue_reg(ndev, tx_ring); ++ ++ return true; ++} ++ ++static void hns3_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hnae3_handle *h = priv->ae_handle; ++ ++ if (!hns3_get_tx_timeo_queue_info(ndev)) ++ return; ++ ++ /* request the reset, and let the hclge to determine ++ * which reset level should be done ++ */ ++ if (h->ae_algo->ops->reset_event) ++ h->ae_algo->ops->reset_event(h->pdev, h); ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++static int hns3_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, ++ u16 rxq_index, u32 flow_id) ++{ ++ struct hnae3_handle *h = hns3_get_handle(dev); ++ struct flow_keys fkeys; ++ ++ if (!h->ae_algo->ops->add_arfs_entry) ++ return -EOPNOTSUPP; ++ ++ if (skb->encapsulation) ++ return -EPROTONOSUPPORT; ++ ++ if (!skb_flow_dissect_flow_keys(skb, &fkeys, 0)) ++ return -EPROTONOSUPPORT; ++ ++ if ((fkeys.basic.n_proto != htons(ETH_P_IP) && ++ fkeys.basic.n_proto != htons(ETH_P_IPV6)) || ++ (fkeys.basic.ip_proto != IPPROTO_TCP && ++ fkeys.basic.ip_proto != IPPROTO_UDP)) ++ return -EPROTONOSUPPORT; ++ ++ return h->ae_algo->ops->add_arfs_entry(h, rxq_index, flow_id, &fkeys); ++} ++#endif ++ ++static int hns3_nic_get_vf_config(struct net_device *ndev, int vf, ++ struct ifla_vf_info *ivf) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->get_vf_config) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->get_vf_config(h, vf, ivf); ++} ++ ++static int hns3_nic_set_vf_link_state(struct net_device *ndev, int vf, ++ int link_state) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_link_state) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_link_state(h, vf, link_state); ++} ++ ++static int hns3_nic_set_vf_rate(struct net_device *ndev, int vf, ++ int min_tx_rate, int max_tx_rate) ++{ ++ struct hnae3_handle *h = hns3_get_handle(ndev); ++ ++ if (!h->ae_algo->ops->set_vf_rate) ++ return -EOPNOTSUPP; ++ ++ return h->ae_algo->ops->set_vf_rate(h, vf, min_tx_rate, max_tx_rate, ++ false); ++} ++ ++static int hns3_nic_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ if (!h->ae_algo->ops->set_vf_mac) ++ return -EOPNOTSUPP; ++ ++ if (is_multicast_ether_addr(mac)) { ++ hnae3_format_mac_addr(format_mac_addr, mac); ++ netdev_err(netdev, ++ "Invalid MAC:%s specified. 
Could not set MAC\n", ++ format_mac_addr); ++ return -EINVAL; ++ } ++ ++ return h->ae_algo->ops->set_vf_mac(h, vf_id, mac); ++} ++ ++static const struct net_device_ops hns3_nic_netdev_ops = { ++ .ndo_open = hns3_nic_net_open, ++ .ndo_stop = hns3_nic_net_stop, ++ .ndo_start_xmit = hns3_nic_net_xmit, ++ .ndo_tx_timeout = hns3_nic_net_timeout, ++ .ndo_set_mac_address = hns3_nic_net_set_mac_address, ++ .ndo_eth_ioctl = hns3_nic_do_ioctl, ++ .ndo_change_mtu = hns3_nic_change_mtu, ++ .ndo_set_features = hns3_nic_set_features, ++ .ndo_features_check = hns3_features_check, ++ .ndo_get_stats64 = hns3_nic_get_stats64, ++ .ndo_setup_tc = hns3_nic_setup_tc, ++ .ndo_set_rx_mode = hns3_nic_set_rx_mode, ++ .ndo_vlan_rx_add_vid = hns3_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = hns3_vlan_rx_kill_vid, ++ .ndo_set_vf_vlan = hns3_ndo_set_vf_vlan, ++ .ndo_set_vf_spoofchk = hns3_set_vf_spoofchk, ++ .ndo_set_vf_trust = hns3_set_vf_trust, ++#ifdef CONFIG_RFS_ACCEL ++ .ndo_rx_flow_steer = hns3_rx_flow_steer, ++#endif ++ .ndo_get_vf_config = hns3_nic_get_vf_config, ++ .ndo_set_vf_link_state = hns3_nic_set_vf_link_state, ++ .ndo_set_vf_rate = hns3_nic_set_vf_rate, ++ .ndo_set_vf_mac = hns3_nic_set_vf_mac, ++}; ++ ++bool hns3_is_phys_func(struct pci_dev *pdev) ++{ ++ u32 dev_id = pdev->device; ++ ++ switch (dev_id) { ++ case HNAE3_DEV_ID_GE: ++ case HNAE3_DEV_ID_25GE: ++ case HNAE3_DEV_ID_25GE_RDMA: ++ case HNAE3_DEV_ID_25GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_50GE_RDMA: ++ case HNAE3_DEV_ID_50GE_RDMA_MACSEC: ++ case HNAE3_DEV_ID_100G_RDMA_MACSEC: ++ case HNAE3_DEV_ID_200G_RDMA: ++ return true; ++ case HNAE3_DEV_ID_VF: ++ case HNAE3_DEV_ID_RDMA_DCB_PFC_VF: ++ return false; ++ default: ++ dev_warn(&pdev->dev, "un-recognized pci device-id %u", ++ dev_id); ++ } ++ ++ return false; ++} ++ ++static void hns3_disable_sriov(struct pci_dev *pdev) ++{ ++ /* If our VFs are assigned we cannot shut down SR-IOV ++ * without causing issues, so just leave the hardware ++ * available but disabled ++ */ ++ if (pci_vfs_assigned(pdev)) { ++ dev_warn(&pdev->dev, ++ "disabling driver while VFs are assigned\n"); ++ return; ++ } ++ ++ pci_disable_sriov(pdev); ++} ++ ++/* hns3_probe - Device initialization routine ++ * @pdev: PCI device information struct ++ * @ent: entry in hns3_pci_tbl ++ * ++ * hns3_probe initializes a PF identified by a pci_dev structure. ++ * The OS initialization, configuring of the PF private structure, ++ * and a hardware reset occur. 
++ * ++ * Returns 0 on success, negative on failure ++ */ ++static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ++{ ++ struct hnae3_ae_dev *ae_dev; ++ int ret; ++ ++ ae_dev = devm_kzalloc(&pdev->dev, sizeof(*ae_dev), GFP_KERNEL); ++ if (!ae_dev) ++ return -ENOMEM; ++ ++ ae_dev->pdev = pdev; ++ ae_dev->flag = ent->driver_data; ++ pci_set_drvdata(pdev, ae_dev); ++ ++ ret = hnae3_register_ae_dev(ae_dev); ++ if (ret) ++ pci_set_drvdata(pdev, NULL); ++ ++ return ret; ++} ++ ++/** ++ * hns3_clean_vf_config ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs allocated ++ * ++ * Clean residual vf config after disable sriov ++ **/ ++static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (ae_dev->ops->clean_vf_config) ++ ae_dev->ops->clean_vf_config(ae_dev, num_vfs); ++} ++ ++/* hns3_remove - Device removal routine ++ * @pdev: PCI device information struct ++ */ ++static void hns3_remove(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV)) ++ hns3_disable_sriov(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++} ++ ++/** ++ * hns3_pci_sriov_configure ++ * @pdev: pointer to a pci_dev structure ++ * @num_vfs: number of VFs to allocate ++ * ++ * Enable or change the number of VFs. Called when the user updates the number ++ * of VFs in sysfs. ++ **/ ++static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) ++{ ++ int ret; ++ ++ if (!(hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))) { ++ dev_warn(&pdev->dev, "Can not config SRIOV\n"); ++ return -EINVAL; ++ } ++ ++ if (num_vfs) { ++ ret = pci_enable_sriov(pdev, num_vfs); ++ if (ret) ++ dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret); ++ else ++ return num_vfs; ++ } else if (!pci_vfs_assigned(pdev)) { ++ int num_vfs_pre = pci_num_vf(pdev); ++ ++ pci_disable_sriov(pdev); ++ hns3_clean_vf_config(pdev, num_vfs_pre); ++ } else { ++ dev_warn(&pdev->dev, ++ "Unable to free VFs because some are assigned to VMs.\n"); ++ } ++ ++ return 0; ++} ++ ++static void hns3_shutdown(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ hnae3_unregister_ae_dev(ae_dev); ++ pci_set_drvdata(pdev, NULL); ++ ++ if (system_state == SYSTEM_POWER_OFF) ++ pci_set_power_state(pdev, PCI_D3hot); ++} ++ ++static int __maybe_unused hns3_suspend(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to suspend.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FUNC_RESET); ++ } ++ ++ return 0; ++} ++ ++static int __maybe_unused hns3_resume(struct device *dev) ++{ ++ struct hnae3_ae_dev *ae_dev = dev_get_drvdata(dev); ++ ++ if (ae_dev && hns3_is_phys_func(ae_dev->pdev)) { ++ dev_info(dev, "Begin to resume.\n"); ++ if (ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++ } ++ ++ return 0; ++} ++ ++static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev, ++ pci_channel_state_t state) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ pci_ers_result_t ret; ++ ++ dev_info(&pdev->dev, "PCI error detected, state(=%u)!!\n", state); ++ ++ if (state == pci_channel_io_perm_failure) ++ return PCI_ERS_RESULT_DISCONNECT; ++ ++ if (!ae_dev || !ae_dev->ops) { ++ dev_err(&pdev->dev, ++ "Can't recover - 
error happened before device initialized\n"); ++ return PCI_ERS_RESULT_NONE; ++ } ++ ++ if (ae_dev->ops->handle_hw_ras_error) ++ ret = ae_dev->ops->handle_hw_ras_error(ae_dev); ++ else ++ return PCI_ERS_RESULT_NONE; ++ ++ return ret; ++} ++ ++static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ const struct hnae3_ae_ops *ops; ++ enum hnae3_reset_type reset_type; ++ struct device *dev = &pdev->dev; ++ ++ if (!ae_dev || !ae_dev->ops) ++ return PCI_ERS_RESULT_NONE; ++ ++ ops = ae_dev->ops; ++ /* request the reset */ ++ if (ops->reset_event && ops->get_reset_level && ++ ops->set_default_reset_request) { ++ if (ae_dev->hw_err_reset_req) { ++ reset_type = ops->get_reset_level(ae_dev, ++ &ae_dev->hw_err_reset_req); ++ ops->set_default_reset_request(ae_dev, reset_type); ++ dev_info(dev, "requesting reset due to PCI error\n"); ++ ops->reset_event(pdev, NULL); ++ } ++ ++ return PCI_ERS_RESULT_RECOVERED; ++ } ++ ++ return PCI_ERS_RESULT_DISCONNECT; ++} ++ ++static void hns3_reset_prepare(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR prepare\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_prepare) ++ ae_dev->ops->reset_prepare(ae_dev, HNAE3_FLR_RESET); ++} ++ ++static void hns3_reset_done(struct pci_dev *pdev) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ dev_info(&pdev->dev, "FLR done\n"); ++ if (ae_dev && ae_dev->ops && ae_dev->ops->reset_done) ++ ae_dev->ops->reset_done(ae_dev); ++} ++ ++static const struct pci_error_handlers hns3_err_handler = { ++ .error_detected = hns3_error_detected, ++ .slot_reset = hns3_slot_reset, ++ .reset_prepare = hns3_reset_prepare, ++ .reset_done = hns3_reset_done, ++}; ++ ++static SIMPLE_DEV_PM_OPS(hns3_pm_ops, hns3_suspend, hns3_resume); ++ ++static struct pci_driver hns3_driver = { ++ .name = hns3_driver_name, ++ .id_table = hns3_pci_tbl, ++ .probe = hns3_probe, ++ .remove = hns3_remove, ++ .shutdown = hns3_shutdown, ++ .driver.pm = &hns3_pm_ops, ++ .sriov_configure = hns3_pci_sriov_configure, ++ .err_handler = &hns3_err_handler, ++}; ++ ++/* set default feature to hns3 */ ++static void hns3_set_default_feature(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct pci_dev *pdev = h->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ ++ netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; ++ ++ netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | ++ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | ++ NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | ++ NETIF_F_GRO | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_GRE | ++ NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_UDP_TUNNEL | ++ NETIF_F_SCTP_CRC | NETIF_F_FRAGLIST; ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V2) { ++ netdev->features |= NETIF_F_GRO_HW; ++ ++ if (!(h->flags & HNAE3_SUPPORT_VF)) ++ netdev->features |= NETIF_F_NTUPLE; ++ } ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_GSO_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_L4; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_CSUM; ++ else ++ netdev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_UDP_TUNNEL_CSUM_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_FD_FORWARD_TC_B, ae_dev->caps)) ++ netdev->features |= NETIF_F_HW_TC; ++ ++ netdev->hw_features |= 
netdev->features; ++ if (!test_bit(HNAE3_DEV_SUPPORT_VLAN_FLTR_MDF_B, ae_dev->caps)) ++ netdev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; ++ ++ netdev->vlan_features |= netdev->features & ++ ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_GRO_HW | NETIF_F_NTUPLE | ++ NETIF_F_HW_TC); ++ ++ netdev->hw_enc_features |= netdev->vlan_features | NETIF_F_TSO_MANGLEID; ++} ++ ++static int hns3_alloc_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ unsigned int order = hns3_page_order(ring); ++ struct page *p; ++ ++ if (ring->page_pool) { ++ p = page_pool_dev_alloc_frag(ring->page_pool, ++ &cb->page_offset, ++ hns3_buf_size(ring)); ++ if (unlikely(!p)) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->buf = page_address(p); ++ cb->dma = page_pool_get_dma_addr(p); ++ cb->type = DESC_TYPE_PP_FRAG; ++ cb->reuse_flag = 0; ++ return 0; ++ } ++ ++ p = dev_alloc_pages(order); ++ if (!p) ++ return -ENOMEM; ++ ++ cb->priv = p; ++ cb->page_offset = 0; ++ cb->reuse_flag = 0; ++ cb->buf = page_address(p); ++ cb->length = hns3_page_size(ring); ++ cb->type = DESC_TYPE_PAGE; ++ page_ref_add(p, USHRT_MAX - 1); ++ cb->pagecnt_bias = USHRT_MAX; ++ ++ return 0; ++} ++ ++static void hns3_free_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb, int budget) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) ++ napi_consume_skb(cb->priv, budget); ++ else if (!HNAE3_IS_TX_RING(ring)) { ++ if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) ++ __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); ++ else if (cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, cb->priv, ++ false); ++ } ++ memset(cb, 0, sizeof(*cb)); ++} ++ ++static int hns3_map_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) ++{ ++ cb->dma = dma_map_page(ring_to_dev(ring), cb->priv, 0, ++ cb->length, ring_to_dma_dir(ring)); ++ ++ if (unlikely(dma_mapping_error(ring_to_dev(ring), cb->dma))) ++ return -EIO; ++ ++ return 0; ++} ++ ++static void hns3_unmap_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_FRAGLIST_SKB)) ++ dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if ((cb->type & DESC_TYPE_PAGE) && cb->length) ++ dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ++ ring_to_dma_dir(ring)); ++ else if (cb->type & (DESC_TYPE_BOUNCE_ALL | DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) ++ hns3_tx_spare_reclaim_cb(ring, cb); ++} ++ ++static void hns3_buffer_detach(struct hns3_enet_ring *ring, int i) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc[i].addr = 0; ++ ring->desc_cb[i].refill = 0; ++} ++ ++static void hns3_free_buffer_detach(struct hns3_enet_ring *ring, int i, ++ int budget) ++{ ++ struct hns3_desc_cb *cb = &ring->desc_cb[i]; ++ ++ if (!ring->desc_cb[i].dma) ++ return; ++ ++ hns3_buffer_detach(ring, i); ++ hns3_free_buffer(ring, cb, budget); ++} ++ ++static void hns3_free_buffers(struct hns3_enet_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->desc_num; i++) ++ hns3_free_buffer_detach(ring, i, 0); ++} ++ ++/* free desc along with its attached buffer */ ++static void hns3_free_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ hns3_free_buffers(ring); ++ ++ if (ring->desc) { ++ dma_free_coherent(ring_to_dev(ring), size, ++ ring->desc, ring->desc_dma_addr); ++ ring->desc = NULL; ++ } ++} ++ ++static int 
hns3_alloc_desc(struct hns3_enet_ring *ring) ++{ ++ int size = ring->desc_num * sizeof(ring->desc[0]); ++ ++ ring->desc = dma_alloc_coherent(ring_to_dev(ring), size, ++ &ring->desc_dma_addr, GFP_KERNEL); ++ if (!ring->desc) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, ++ struct hns3_desc_cb *cb) ++{ ++ int ret; ++ ++ ret = hns3_alloc_buffer(ring, cb); ++ if (ret || ring->page_pool) ++ goto out; ++ ++ ret = hns3_map_buffer(ring, cb); ++ if (ret) ++ goto out_with_buf; ++ ++ return 0; ++ ++out_with_buf: ++ hns3_free_buffer(ring, cb, 0); ++out: ++ return ret; ++} ++ ++static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ int ret = hns3_alloc_and_map_buffer(ring, &ring->desc_cb[i]); ++ ++ if (ret) ++ return ret; ++ ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc_cb[i].refill = 1; ++ ++ return 0; ++} ++ ++/* Allocate memory for raw pkg, and map with dma */ ++static int hns3_alloc_ring_buffers(struct hns3_enet_ring *ring) ++{ ++ int i, j, ret; ++ ++ for (i = 0; i < ring->desc_num; i++) { ++ ret = hns3_alloc_and_attach_buffer(ring, i); ++ if (ret) ++ goto out_buffer_fail; ++ } ++ ++ return 0; ++ ++out_buffer_fail: ++ for (j = i - 1; j >= 0; j--) ++ hns3_free_buffer_detach(ring, j, 0); ++ return ret; ++} ++ ++/* detach a in-used buffer and replace with a reserved one */ ++static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, ++ struct hns3_desc_cb *res_cb) ++{ ++ hns3_unmap_buffer(ring, &ring->desc_cb[i]); ++ ring->desc_cb[i] = *res_cb; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++} ++ ++static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i) ++{ ++ ring->desc_cb[i].reuse_flag = 0; ++ ring->desc_cb[i].refill = 1; ++ ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + ++ ring->desc_cb[i].page_offset); ++ ring->desc[i].rx.bd_base_info = 0; ++ ++ dma_sync_single_for_device(ring_to_dev(ring), ++ ring->desc_cb[i].dma + ring->desc_cb[i].page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++} ++ ++static bool hns3_nic_reclaim_desc(struct hns3_enet_ring *ring, ++ int *bytes, int *pkts, int budget) ++{ ++ /* pair with ring->last_to_use update in hns3_tx_doorbell(), ++ * smp_store_release() is not used in hns3_tx_doorbell() because ++ * the doorbell operation already have the needed barrier operation. ++ */ ++ int ltu = smp_load_acquire(&ring->last_to_use); ++ int ntc = ring->next_to_clean; ++ struct hns3_desc_cb *desc_cb; ++ bool reclaimed = false; ++ struct hns3_desc *desc; ++ ++ while (ltu != ntc) { ++ desc = &ring->desc[ntc]; ++ ++ if (le16_to_cpu(desc->tx.bdtp_fe_sc_vld_ra_ri) & ++ BIT(HNS3_TXD_VLD_B)) ++ break; ++ ++ desc_cb = &ring->desc_cb[ntc]; ++ ++ if (desc_cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_ALL | ++ DESC_TYPE_BOUNCE_HEAD | ++ DESC_TYPE_SGL_SKB)) { ++ (*pkts)++; ++ (*bytes) += desc_cb->send_bytes; ++ } ++ ++ /* desc_cb will be cleaned, after hnae3_free_buffer_detach */ ++ hns3_free_buffer_detach(ring, ntc, budget); ++ ++ if (++ntc == ring->desc_num) ++ ntc = 0; ++ ++ /* Issue prefetch for next Tx descriptor */ ++ prefetch(&ring->desc_cb[ntc]); ++ reclaimed = true; ++ } ++ ++ if (unlikely(!reclaimed)) ++ return false; ++ ++ /* This smp_store_release() pairs with smp_load_acquire() in ++ * ring_space called by hns3_nic_net_xmit. 
++ */ ++ smp_store_release(&ring->next_to_clean, ntc); ++ ++ hns3_tx_spare_update(ring); ++ ++ return true; ++} ++ ++void hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ struct netdev_queue *dev_queue; ++ int bytes, pkts; ++ ++ bytes = 0; ++ pkts = 0; ++ ++ if (unlikely(!hns3_nic_reclaim_desc(ring, &bytes, &pkts, budget))) ++ return; ++ ++ ring->tqp_vector->tx_group.total_bytes += bytes; ++ ring->tqp_vector->tx_group.total_packets += pkts; ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.tx_bytes += bytes; ++ ring->stats.tx_pkts += pkts; ++ u64_stats_update_end(&ring->syncp); ++ ++ dev_queue = netdev_get_tx_queue(netdev, ring->tqp->tqp_index); ++ netdev_tx_completed_queue(dev_queue, pkts, bytes); ++ ++ if (unlikely(netif_carrier_ok(netdev) && ++ ring_space(ring) > HNS3_MAX_TSO_BD_NUM)) { ++ /* Make sure that anybody stopping the queue after this ++ * sees the new next_to_clean. ++ */ ++ smp_mb(); ++ if (netif_tx_queue_stopped(dev_queue) && ++ !test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) { ++ netif_tx_wake_queue(dev_queue); ++ ring->stats.restart_queue++; ++ } ++ } ++} ++ ++static int hns3_desc_unused(struct hns3_enet_ring *ring) ++{ ++ int ntc = ring->next_to_clean; ++ int ntu = ring->next_to_use; ++ ++ if (unlikely(ntc == ntu && !ring->desc_cb[ntc].refill)) ++ return ring->desc_num; ++ ++ return ((ntc >= ntu) ? 0 : ring->desc_num) + ntc - ntu; ++} ++ ++/* Return true if there is any allocation failure */ ++static bool hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, ++ int cleand_count) ++{ ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc_cb res_cbs; ++ int i, ret; ++ ++ for (i = 0; i < cleand_count; i++) { ++ desc_cb = &ring->desc_cb[ring->next_to_use]; ++ if (desc_cb->reuse_flag) { ++ hns3_ring_stats_update(ring, reuse_pg_cnt); ++ ++ hns3_reuse_buffer(ring, ring->next_to_use); ++ } else { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx buffer failed: %d\n", ++ ret); ++ ++ writel(i, ring->tqp->io_base + ++ HNS3_RING_RX_RING_HEAD_REG); ++ return true; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ ++ hns3_ring_stats_update(ring, non_reuse_pg); ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ writel(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG); ++ return false; ++} ++ ++static bool hns3_can_reuse_page(struct hns3_desc_cb *cb) ++{ ++ return page_count(cb->priv) == cb->pagecnt_bias; ++} ++ ++static int hns3_handle_rx_copybreak(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, ++ int pull_len, ++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 frag_size = size - pull_len; ++ void *frag = napi_alloc_frag(frag_size); ++ ++ if (unlikely(!frag)) { ++ hns3_ring_stats_update(ring, frag_alloc_err); ++ ++ hns3_rl_err(ring_to_netdev(ring), ++ "failed to allocate rx frag\n"); ++ return -ENOMEM; ++ } ++ ++ desc_cb->reuse_flag = 1; ++ memcpy(frag, desc_cb->buf + frag_offset, frag_size); ++ skb_add_rx_frag(skb, i, virt_to_page(frag), ++ offset_in_page(frag), frag_size, frag_size); ++ ++ hns3_ring_stats_update(ring, frag_alloc); ++ return 0; ++} ++ ++static void hns3_nic_reuse_page(struct sk_buff *skb, int i, ++ struct hns3_enet_ring *ring, int pull_len, 
++ struct hns3_desc_cb *desc_cb) ++{ ++ struct hns3_desc *desc = &ring->desc[ring->next_to_clean]; ++ u32 frag_offset = desc_cb->page_offset + pull_len; ++ int size = le16_to_cpu(desc->rx.size); ++ u32 truesize = hns3_buf_size(ring); ++ u32 frag_size = size - pull_len; ++ int ret = 0; ++ bool reused; ++ ++ if (ring->page_pool) { ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ return; ++ } ++ ++ /* Avoid re-using remote or pfmem page */ ++ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) ++ goto out; ++ ++ reused = hns3_can_reuse_page(desc_cb); ++ ++ /* Rx page can be reused when: ++ * 1. Rx page is only owned by the driver when page_offset ++ * is zero, which means 0 @ truesize will be used by ++ * stack after skb_add_rx_frag() is called, and the rest ++ * of rx page can be reused by driver. ++ * Or ++ * 2. Rx page is only owned by the driver when page_offset ++ * is non-zero, which means page_offset @ truesize will ++ * be used by stack after skb_add_rx_frag() is called, ++ * and 0 @ truesize can be reused by driver. ++ */ ++ if ((!desc_cb->page_offset && reused) || ++ ((desc_cb->page_offset + truesize + truesize) <= ++ hns3_page_size(ring) && desc_cb->page_offset)) { ++ desc_cb->page_offset += truesize; ++ desc_cb->reuse_flag = 1; ++ } else if (desc_cb->page_offset && reused) { ++ desc_cb->page_offset = 0; ++ desc_cb->reuse_flag = 1; ++ } else if (frag_size <= ring->rx_copybreak) { ++ ret = hns3_handle_rx_copybreak(skb, i, ring, pull_len, desc_cb); ++ if (!ret) ++ return; ++ } ++ ++out: ++ desc_cb->pagecnt_bias--; ++ ++ if (unlikely(!desc_cb->pagecnt_bias)) { ++ page_ref_add(desc_cb->priv, USHRT_MAX); ++ desc_cb->pagecnt_bias = USHRT_MAX; ++ } ++ ++ skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, ++ frag_size, truesize); ++ ++ if (unlikely(!desc_cb->reuse_flag)) ++ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); ++} ++ ++static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) ++{ ++ __be16 type = skb->protocol; ++ struct tcphdr *th; ++ int depth = 0; ++ ++ while (eth_type_vlan(type)) { ++ struct vlan_hdr *vh; ++ ++ if ((depth + VLAN_HLEN) > skb_headlen(skb)) ++ return -EFAULT; ++ ++ vh = (struct vlan_hdr *)(skb->data + depth); ++ type = vh->h_vlan_encapsulated_proto; ++ depth += VLAN_HLEN; ++ } ++ ++ skb_set_network_header(skb, depth); ++ ++ if (type == htons(ETH_P_IP)) { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ depth += sizeof(struct iphdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v4_check(skb->len - depth, iph->saddr, ++ iph->daddr, 0); ++ } else if (type == htons(ETH_P_IPV6)) { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ depth += sizeof(struct ipv6hdr); ++ skb_set_transport_header(skb, depth); ++ th = tcp_hdr(skb); ++ th->check = ~tcp_v6_check(skb->len - depth, &iph->saddr, ++ &iph->daddr, 0); ++ } else { ++ hns3_rl_err(skb->dev, ++ "Error: FW GRO supports only IPv4/IPv6, not 0x%04x, depth: %d\n", ++ be16_to_cpu(type), depth); ++ return -EFAULT; ++ } ++ ++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; ++ if (th->cwr) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; ++ ++ if (l234info & BIT(HNS3_RXD_GRO_FIXID_B)) ++ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; ++ ++ skb->csum_start = (unsigned char *)th - skb->head; ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ trace_hns3_gro(skb); ++ ++ return 0; ++} ++ ++static bool hns3_checksum_complete(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 
ptype, u16 csum) ++{ ++ if (ptype == HNS3_INVALID_PTYPE || ++ hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) ++ return false; ++ ++ hns3_ring_stats_update(ring, csum_complete); ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum = csum_unfold((__force __sum16)csum); ++ ++ return true; ++} ++ ++static void hns3_rx_handle_csum(struct sk_buff *skb, u32 l234info, ++ u32 ol_info, u32 ptype) ++{ ++ int l3_type, l4_type; ++ int ol4_type; ++ ++ if (ptype != HNS3_INVALID_PTYPE) { ++ skb->csum_level = hns3_rx_ptype_tbl[ptype].csum_level; ++ skb->ip_summed = hns3_rx_ptype_tbl[ptype].ip_summed; ++ ++ return; ++ } ++ ++ ol4_type = hnae3_get_field(ol_info, HNS3_RXD_OL4ID_M, ++ HNS3_RXD_OL4ID_S); ++ switch (ol4_type) { ++ case HNS3_OL4_TYPE_MAC_IN_UDP: ++ case HNS3_OL4_TYPE_NVGRE: ++ skb->csum_level = 1; ++ fallthrough; ++ case HNS3_OL4_TYPE_NO_TUN: ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ /* Can checksum ipv4 or ipv6 + UDP/TCP/SCTP packets */ ++ if ((l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) && ++ (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP)) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ break; ++ default: ++ break; ++ } ++} ++ ++static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, ++ u32 l234info, u32 bd_base_info, u32 ol_info, ++ u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 ptype = HNS3_INVALID_PTYPE; ++ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ skb_checksum_none_assert(skb); ++ ++ if (!(netdev->features & NETIF_F_RXCSUM)) ++ return; ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) ++ ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ if (hns3_checksum_complete(ring, skb, ptype, csum)) ++ return; ++ ++ /* check if hardware has done checksum */ ++ if (!(bd_base_info & BIT(HNS3_RXD_L3L4P_B))) ++ return; ++ ++ if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | ++ BIT(HNS3_RXD_OL3E_B) | ++ BIT(HNS3_RXD_OL4E_B)))) { ++ hns3_ring_stats_update(ring, l3l4_csum_err); ++ ++ return; ++ } ++ ++ hns3_rx_handle_csum(skb, l234info, ol_info, ptype); ++} ++ ++static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ if (skb_has_frag_list(skb)) ++ napi_gro_flush(&ring->tqp_vector->napi, false); ++ ++ napi_gro_receive(&ring->tqp_vector->napi, skb); ++} ++ ++static bool hns3_parse_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, u32 l234info, ++ u16 *vlan_tag) ++{ ++ struct hnae3_handle *handle = ring->tqp->handle; ++ struct pci_dev *pdev = ring->tqp->handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ ++ if (unlikely(ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2)) { ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ if (!(*vlan_tag & VLAN_VID_MASK)) ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return (*vlan_tag != 0); ++ } ++ ++#define HNS3_STRP_OUTER_VLAN 0x1 ++#define HNS3_STRP_INNER_VLAN 0x2 ++#define HNS3_STRP_BOTH 0x3 ++ ++ /* Hardware always insert VLAN tag into RX descriptor when ++ * remove the tag from packet, driver needs to determine ++ * reporting which tag to stack. 
++ */ ++ switch (hnae3_get_field(l234info, HNS3_RXD_STRP_TAGP_M, ++ HNS3_RXD_STRP_TAGP_S)) { ++ case HNS3_STRP_OUTER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ return true; ++ case HNS3_STRP_INNER_VLAN: ++ if (handle->port_base_vlan_state != ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ return false; ++ ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ return true; ++ case HNS3_STRP_BOTH: ++ if (handle->port_base_vlan_state == ++ HNAE3_PORT_BASE_VLAN_DISABLE) ++ *vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag); ++ else ++ *vlan_tag = le16_to_cpu(desc->rx.vlan_tag); ++ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void hns3_rx_ring_move_fw(struct hns3_enet_ring *ring) ++{ ++ ring->desc[ring->next_to_clean].rx.bd_base_info &= ++ cpu_to_le32(~BIT(HNS3_RXD_VLD_B)); ++ ring->desc_cb[ring->next_to_clean].refill = 0; ++ ring->next_to_clean += 1; ++ ++ if (unlikely(ring->next_to_clean == ring->desc_num)) ++ ring->next_to_clean = 0; ++} ++ ++static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, ++ unsigned char *va) ++{ ++ struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct sk_buff *skb; ++ ++ ring->skb = napi_alloc_skb(&ring->tqp_vector->napi, HNS3_RX_HEAD_SIZE); ++ skb = ring->skb; ++ if (unlikely(!skb)) { ++ hns3_rl_err(netdev, "alloc rx skb fail\n"); ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ ++ return -ENOMEM; ++ } ++ ++ trace_hns3_rx_desc(ring); ++ prefetchw(skb->data); ++ ++ ring->pending_buf = 1; ++ ring->frag_num = 0; ++ ring->tail_skb = NULL; ++ if (length <= HNS3_RX_HEAD_SIZE) { ++ memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long))); ++ ++ /* We can reuse buffer as-is, just make sure it is reusable */ ++ if (dev_page_is_reusable(desc_cb->priv)) ++ desc_cb->reuse_flag = 1; ++ else if (desc_cb->type & DESC_TYPE_PP_FRAG) ++ page_pool_put_full_page(ring->page_pool, desc_cb->priv, ++ false); ++ else /* This page cannot be reused so discard it */ ++ __page_frag_cache_drain(desc_cb->priv, ++ desc_cb->pagecnt_bias); ++ ++ hns3_rx_ring_move_fw(ring); ++ return 0; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(skb); ++ ++ hns3_ring_stats_update(ring, seg_pkt_cnt); ++ ++ ring->pull_len = eth_get_headlen(netdev, va, HNS3_RX_HEAD_SIZE); ++ __skb_put(skb, ring->pull_len); ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, ring->pull_len, ++ desc_cb); ++ hns3_rx_ring_move_fw(ring); ++ ++ return 0; ++} ++ ++static int hns3_add_frag(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct sk_buff *head_skb = skb; ++ struct sk_buff *new_skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ u32 bd_base_info; ++ ++ do { ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* make sure HW write desc complete */ ++ dma_rmb(); ++ if (!(bd_base_info & BIT(HNS3_RXD_VLD_B))) ++ return -ENXIO; ++ ++ if (unlikely(ring->frag_num >= MAX_SKB_FRAGS)) { ++ new_skb = napi_alloc_skb(&ring->tqp_vector->napi, 0); ++ if (unlikely(!new_skb)) { ++ hns3_rl_err(ring_to_netdev(ring), ++ "alloc rx fraglist skb fail\n"); ++ return -ENXIO; ++ } ++ ++ if (ring->page_pool) ++ skb_mark_for_recycle(new_skb); ++ ++ ring->frag_num = 0; ++ ++ if (ring->tail_skb) { ++ ring->tail_skb->next = new_skb; ++ ring->tail_skb = new_skb; ++ } else { ++ skb_shinfo(skb)->frag_list = 
new_skb; ++ ring->tail_skb = new_skb; ++ } ++ } ++ ++ if (ring->tail_skb) { ++ head_skb->truesize += hns3_buf_size(ring); ++ head_skb->data_len += le16_to_cpu(desc->rx.size); ++ head_skb->len += le16_to_cpu(desc->rx.size); ++ skb = ring->tail_skb; ++ } ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ hns3_nic_reuse_page(skb, ring->frag_num++, ring, 0, desc_cb); ++ trace_hns3_rx_desc(ring); ++ hns3_rx_ring_move_fw(ring); ++ ring->pending_buf++; ++ } while (!(bd_base_info & BIT(HNS3_RXD_FE_B))); ++ ++ return 0; ++} ++ ++static int hns3_set_gro_and_checksum(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 l234info, ++ u32 bd_base_info, u32 ol_info, u16 csum) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ u32 l3_type; ++ ++ skb_shinfo(skb)->gso_size = hnae3_get_field(bd_base_info, ++ HNS3_RXD_GRO_SIZE_M, ++ HNS3_RXD_GRO_SIZE_S); ++ /* if there is no HW GRO, do not set gro params */ ++ if (!skb_shinfo(skb)->gso_size) { ++ hns3_rx_checksum(ring, skb, l234info, bd_base_info, ol_info, ++ csum); ++ return 0; ++ } ++ ++ NAPI_GRO_CB(skb)->count = hnae3_get_field(l234info, ++ HNS3_RXD_GRO_COUNT_M, ++ HNS3_RXD_GRO_COUNT_S); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ l3_type = hns3_rx_ptype_tbl[ptype].l3_type; ++ } else { ++ l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ } ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ else if (l3_type == HNS3_L3_TYPE_IPV6) ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; ++ else ++ return -EFAULT; ++ ++ return hns3_gro_complete(skb, l234info); ++} ++ ++static void hns3_set_rx_skb_rss_type(struct hns3_enet_ring *ring, ++ struct sk_buff *skb, u32 rss_hash, ++ u32 l234info, u32 ol_info) ++{ ++ enum pkt_hash_types rss_type = PKT_HASH_TYPE_NONE; ++ struct net_device *netdev = ring_to_netdev(ring); ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (test_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state)) { ++ u32 ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, ++ HNS3_RXD_PTYPE_S); ++ ++ rss_type = hns3_rx_ptype_tbl[ptype].hash_type; ++ } else { ++ int l3_type = hnae3_get_field(l234info, HNS3_RXD_L3ID_M, ++ HNS3_RXD_L3ID_S); ++ int l4_type = hnae3_get_field(l234info, HNS3_RXD_L4ID_M, ++ HNS3_RXD_L4ID_S); ++ ++ if (l3_type == HNS3_L3_TYPE_IPV4 || ++ l3_type == HNS3_L3_TYPE_IPV6) { ++ if (l4_type == HNS3_L4_TYPE_UDP || ++ l4_type == HNS3_L4_TYPE_TCP || ++ l4_type == HNS3_L4_TYPE_SCTP) ++ rss_type = PKT_HASH_TYPE_L4; ++ else if (l4_type == HNS3_L4_TYPE_IGMP || ++ l4_type == HNS3_L4_TYPE_ICMP) ++ rss_type = PKT_HASH_TYPE_L3; ++ } ++ } ++ ++ skb_set_hash(skb, rss_hash, rss_type); ++} ++ ++static void hns3_handle_rx_ts_info(struct net_device *netdev, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 bd_base_info) ++{ ++ if (unlikely(bd_base_info & BIT(HNS3_RXD_TS_VLD_B))) { ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ u32 nsec = le32_to_cpu(desc->ts_nsec); ++ u32 sec = le32_to_cpu(desc->ts_sec); ++ ++ if (h->ae_algo->ops->get_rx_hwts) ++ h->ae_algo->ops->get_rx_hwts(h, skb, nsec, sec); ++ } ++} ++ ++static void hns3_handle_rx_vlan_tag(struct hns3_enet_ring *ring, ++ struct hns3_desc *desc, struct sk_buff *skb, ++ u32 l234info) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ ++ /* Based on 
hw strategy, the tag offloaded will be stored at ++ * ot_vlan_tag in two layer tag case, and stored at vlan_tag ++ * in one layer tag case. ++ */ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ u16 vlan_tag; ++ ++ if (hns3_parse_vlan_tag(ring, desc, l234info, &vlan_tag)) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ vlan_tag); ++ } ++} ++ ++static int hns3_handle_bdinfo(struct hns3_enet_ring *ring, struct sk_buff *skb) ++{ ++ struct net_device *netdev = ring_to_netdev(ring); ++ enum hns3_pkt_l2t_type l2_frame_type; ++ u32 bd_base_info, l234info, ol_info; ++ struct hns3_desc *desc; ++ unsigned int len; ++ int pre_ntc, ret; ++ u16 csum; ++ ++ /* bdinfo handled below is only valid on the last BD of the ++ * current packet, and ring->next_to_clean indicates the first ++ * descriptor of next packet, so need - 1 below. ++ */ ++ pre_ntc = ring->next_to_clean ? (ring->next_to_clean - 1) : ++ (ring->desc_num - 1); ++ desc = &ring->desc[pre_ntc]; ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ l234info = le32_to_cpu(desc->rx.l234_info); ++ ol_info = le32_to_cpu(desc->rx.ol_info); ++ csum = le16_to_cpu(desc->csum); ++ ++ hns3_handle_rx_ts_info(netdev, desc, skb, bd_base_info); ++ ++ hns3_handle_rx_vlan_tag(ring, desc, skb, l234info); ++ ++ if (unlikely(!desc->rx.pkt_len || (l234info & (BIT(HNS3_RXD_TRUNCAT_B) | ++ BIT(HNS3_RXD_L2E_B))))) { ++ u64_stats_update_begin(&ring->syncp); ++ if (l234info & BIT(HNS3_RXD_L2E_B)) ++ ring->stats.l2_err++; ++ else ++ ring->stats.err_pkt_len++; ++ u64_stats_update_end(&ring->syncp); ++ ++ return -EFAULT; ++ } ++ ++ len = skb->len; ++ ++ /* Do update ip stack process */ ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ /* This is needed in order to enable forwarding support */ ++ ret = hns3_set_gro_and_checksum(ring, skb, l234info, ++ bd_base_info, ol_info, csum); ++ if (unlikely(ret)) { ++ hns3_ring_stats_update(ring, rx_err_cnt); ++ return ret; ++ } ++ ++ l2_frame_type = hnae3_get_field(l234info, HNS3_RXD_DMAC_M, ++ HNS3_RXD_DMAC_S); ++ ++ u64_stats_update_begin(&ring->syncp); ++ ring->stats.rx_pkts++; ++ ring->stats.rx_bytes += len; ++ ++ if (l2_frame_type == HNS3_L2_TYPE_MULTICAST) ++ ring->stats.rx_multicast++; ++ ++ u64_stats_update_end(&ring->syncp); ++ ++ ring->tqp_vector->rx_group.total_bytes += len; ++ ++ hns3_set_rx_skb_rss_type(ring, skb, le32_to_cpu(desc->rx.rss_hash), ++ l234info, ol_info); ++ return 0; ++} ++ ++static int hns3_handle_rx_bd(struct hns3_enet_ring *ring) ++{ ++ struct sk_buff *skb = ring->skb; ++ struct hns3_desc_cb *desc_cb; ++ struct hns3_desc *desc; ++ unsigned int length; ++ u32 bd_base_info; ++ int ret; ++ ++ desc = &ring->desc[ring->next_to_clean]; ++ desc_cb = &ring->desc_cb[ring->next_to_clean]; ++ ++ prefetch(desc); ++ ++ if (!skb) { ++ bd_base_info = le32_to_cpu(desc->rx.bd_base_info); ++ /* Check valid BD */ ++ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B)))) ++ return -ENXIO; ++ ++ dma_rmb(); ++ length = le16_to_cpu(desc->rx.size); ++ ++ ring->va = desc_cb->buf + desc_cb->page_offset; ++ ++ dma_sync_single_for_cpu(ring_to_dev(ring), ++ desc_cb->dma + desc_cb->page_offset, ++ hns3_buf_size(ring), ++ DMA_FROM_DEVICE); ++ ++ /* Prefetch first cache line of first page. ++ * Idea is to cache few bytes of the header of the packet. ++ * Our L1 Cache line size is 64B so need to prefetch twice to make ++ * it 128B. But in actual we can have greater size of caches with ++ * 128B Level 1 cache lines. In such a case, single fetch would ++ * suffice to cache in the relevant part of the header. 
++ */ ++ net_prefetch(ring->va); ++ ++ ret = hns3_alloc_skb(ring, length, ring->va); ++ skb = ring->skb; ++ ++ if (ret < 0) /* alloc buffer fail */ ++ return ret; ++ if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) { /* need add frag */ ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ } else { ++ ret = hns3_add_frag(ring); ++ if (ret) ++ return ret; ++ } ++ ++ /* As the head data may be changed when GRO enable, copy ++ * the head data in after other data rx completed ++ */ ++ if (skb->len > HNS3_RX_HEAD_SIZE) ++ memcpy(skb->data, ring->va, ++ ALIGN(ring->pull_len, sizeof(long))); ++ ++ ret = hns3_handle_bdinfo(ring, skb); ++ if (unlikely(ret)) { ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ ++ skb_record_rx_queue(skb, ring->tqp->tqp_index); ++ return 0; ++} ++ ++int hns3_clean_rx_ring(struct hns3_enet_ring *ring, int budget, ++ void (*rx_fn)(struct hns3_enet_ring *, struct sk_buff *)) ++{ ++#define RCB_NOF_ALLOC_RX_BUFF_ONCE 16 ++ int unused_count = hns3_desc_unused(ring); ++ bool failure = false; ++ int recv_pkts = 0; ++ int err; ++ ++ unused_count -= ring->pending_buf; ++ ++ while (recv_pkts < budget) { ++ /* Reuse or realloc buffers */ ++ if (unused_count >= RCB_NOF_ALLOC_RX_BUFF_ONCE) { ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ unused_count = 0; ++ } ++ ++ /* Poll one pkt */ ++ err = hns3_handle_rx_bd(ring); ++ /* Do not get FE for the packet or failed to alloc skb */ ++ if (unlikely(!ring->skb || err == -ENXIO)) { ++ goto out; ++ } else if (likely(!err)) { ++ rx_fn(ring, ring->skb); ++ recv_pkts++; ++ } ++ ++ unused_count += ring->pending_buf; ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++out: ++ /* sync head pointer before exiting, since hardware will calculate ++ * FBD number with head pointer ++ */ ++ if (unused_count > 0) ++ failure = failure || ++ hns3_nic_alloc_rx_buffers(ring, unused_count); ++ ++ return failure ? budget : recv_pkts; ++} ++ ++static void hns3_update_rx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *rx_group = &tqp_vector->rx_group; ++ struct dim_sample sample = {}; ++ ++ if (!rx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, rx_group->total_packets, ++ rx_group->total_bytes, &sample); ++ net_dim(&rx_group->dim, sample); ++} ++ ++static void hns3_update_tx_int_coalesce(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct hns3_enet_ring_group *tx_group = &tqp_vector->tx_group; ++ struct dim_sample sample = {}; ++ ++ if (!tx_group->coal.adapt_enable) ++ return; ++ ++ dim_update_sample(tqp_vector->event_cnt, tx_group->total_packets, ++ tx_group->total_bytes, &sample); ++ net_dim(&tx_group->dim, sample); ++} ++ ++static int hns3_nic_common_poll(struct napi_struct *napi, int budget) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(napi->dev); ++ struct hns3_enet_ring *ring; ++ int rx_pkt_total = 0; ++ ++ struct hns3_enet_tqp_vector *tqp_vector = ++ container_of(napi, struct hns3_enet_tqp_vector, napi); ++ bool clean_complete = true; ++ int rx_budget = budget; ++ ++ if (unlikely(test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ napi_complete(napi); ++ return 0; ++ } ++ ++ /* Since the actual Tx work is minimal, we can give the Tx a larger ++ * budget and be more aggressive about cleaning up the Tx descriptors. 
++ */ ++ hns3_for_each_ring(ring, tqp_vector->tx_group) ++ hns3_clean_tx_ring(ring, budget); ++ ++ /* make sure rx ring budget not smaller than 1 */ ++ if (tqp_vector->num_tqps > 1) ++ rx_budget = max(budget / tqp_vector->num_tqps, 1); ++ ++ hns3_for_each_ring(ring, tqp_vector->rx_group) { ++ int rx_cleaned = hns3_clean_rx_ring(ring, rx_budget, ++ hns3_rx_skb); ++ if (rx_cleaned >= rx_budget) ++ clean_complete = false; ++ ++ rx_pkt_total += rx_cleaned; ++ } ++ ++ tqp_vector->rx_group.total_packets += rx_pkt_total; ++ ++ if (!clean_complete) ++ return budget; ++ ++ if (napi_complete(napi) && ++ likely(!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))) { ++ hns3_update_rx_int_coalesce(tqp_vector); ++ hns3_update_tx_int_coalesce(tqp_vector); ++ ++ hns3_mask_vector_irq(tqp_vector, 1); ++ } ++ ++ return rx_pkt_total; ++} ++ ++static int hns3_create_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node **head, ++ bool is_tx) ++{ ++ u32 bit_value = is_tx ? HNAE3_RING_TYPE_TX : HNAE3_RING_TYPE_RX; ++ u32 field_value = is_tx ? HNAE3_RING_GL_TX : HNAE3_RING_GL_RX; ++ struct hnae3_ring_chain_node *cur_chain = *head; ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain; ++ struct hns3_enet_ring *ring; ++ ++ ring = is_tx ? tqp_vector->tx_group.ring : tqp_vector->rx_group.ring; ++ ++ if (cur_chain) { ++ while (cur_chain->next) ++ cur_chain = cur_chain->next; ++ } ++ ++ while (ring) { ++ chain = devm_kzalloc(&pdev->dev, sizeof(*chain), GFP_KERNEL); ++ if (!chain) ++ return -ENOMEM; ++ if (cur_chain) ++ cur_chain->next = chain; ++ else ++ *head = chain; ++ chain->tqp_index = ring->tqp->tqp_index; ++ hnae3_set_bit(chain->flag, HNAE3_RING_TYPE_B, ++ bit_value); ++ hnae3_set_field(chain->int_gl_idx, ++ HNAE3_RING_GL_IDX_M, ++ HNAE3_RING_GL_IDX_S, field_value); ++ ++ cur_chain = chain; ++ ++ ring = ring->next; ++ } ++ ++ return 0; ++} ++ ++static struct hnae3_ring_chain_node * ++hns3_get_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *cur_chain = NULL; ++ struct hnae3_ring_chain_node *chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, true)) ++ goto err_free_chain; ++ ++ if (hns3_create_ring_chain(tqp_vector, &cur_chain, false)) ++ goto err_free_chain; ++ ++ return cur_chain; ++ ++err_free_chain: ++ while (cur_chain) { ++ chain = cur_chain->next; ++ devm_kfree(&pdev->dev, cur_chain); ++ cur_chain = chain; ++ } ++ ++ return NULL; ++} ++ ++static void hns3_free_vector_ring_chain(struct hns3_enet_tqp_vector *tqp_vector, ++ struct hnae3_ring_chain_node *head) ++{ ++ struct pci_dev *pdev = tqp_vector->handle->pdev; ++ struct hnae3_ring_chain_node *chain_tmp, *chain; ++ ++ chain = head; ++ ++ while (chain) { ++ chain_tmp = chain->next; ++ devm_kfree(&pdev->dev, chain); ++ chain = chain_tmp; ++ } ++} ++ ++static void hns3_add_ring_to_group(struct hns3_enet_ring_group *group, ++ struct hns3_enet_ring *ring) ++{ ++ ring->next = group->ring; ++ group->ring = ring; ++ ++ group->count++; ++} ++ ++static void hns3_nic_set_cpumask(struct hns3_nic_priv *priv) ++{ ++ struct pci_dev *pdev = priv->ae_handle->pdev; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int num_vectors = priv->vector_num; ++ int numa_node; ++ int vector_i; ++ ++ numa_node = dev_to_node(&pdev->dev); ++ ++ for (vector_i = 0; vector_i < num_vectors; vector_i++) { ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ cpumask_set_cpu(cpumask_local_spread(vector_i, numa_node), ++ 
&tqp_vector->affinity_mask); ++ } ++} ++ ++static void hns3_rx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_rx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_rx_gl(group->ring->tqp_vector, cur_moder.usec); ++ tqp_vector->rx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->rx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_rx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->rx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_tx_dim_work(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct hns3_enet_ring_group *group = container_of(dim, ++ struct hns3_enet_ring_group, dim); ++ struct hns3_enet_tqp_vector *tqp_vector = group->ring->tqp_vector; ++ struct dim_cq_moder cur_moder = ++ net_dim_get_tx_moderation(dim->mode, dim->profile_ix); ++ ++ hns3_set_vector_coalesce_tx_gl(tqp_vector, cur_moder.usec); ++ tqp_vector->tx_group.coal.int_gl = cur_moder.usec; ++ ++ if (cur_moder.pkts < tqp_vector->tx_group.coal.int_ql_max) { ++ hns3_set_vector_coalesce_tx_ql(tqp_vector, cur_moder.pkts); ++ tqp_vector->tx_group.coal.int_ql = cur_moder.pkts; ++ } ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void hns3_nic_init_dim(struct hns3_enet_tqp_vector *tqp_vector) ++{ ++ INIT_WORK(&tqp_vector->rx_group.dim.work, hns3_rx_dim_work); ++ INIT_WORK(&tqp_vector->tx_group.dim.work, hns3_tx_dim_work); ++} ++ ++static int hns3_nic_init_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int ret; ++ int i; ++ ++ hns3_nic_set_cpumask(priv); ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ hns3_vector_coalesce_init_hw(tqp_vector, priv); ++ tqp_vector->num_tqps = 0; ++ hns3_nic_init_dim(tqp_vector); ++ } ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ u16 vector_i = i % priv->vector_num; ++ u16 tqp_num = h->kinfo.num_tqps; ++ ++ tqp_vector = &priv->tqp_vector[vector_i]; ++ ++ hns3_add_ring_to_group(&tqp_vector->tx_group, ++ &priv->ring[i]); ++ ++ hns3_add_ring_to_group(&tqp_vector->rx_group, ++ &priv->ring[i + tqp_num]); ++ ++ priv->ring[i].tqp_vector = tqp_vector; ++ priv->ring[i + tqp_num].tqp_vector = tqp_vector; ++ tqp_vector->num_tqps++; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ tqp_vector->rx_group.total_bytes = 0; ++ tqp_vector->rx_group.total_packets = 0; ++ tqp_vector->tx_group.total_bytes = 0; ++ tqp_vector->tx_group.total_packets = 0; ++ tqp_vector->handle = h; ++ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) { ++ ret = -ENOMEM; ++ goto map_ring_fail; ++ } ++ ++ ret = h->ae_algo->ops->map_ring_to_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ if (ret) ++ goto map_ring_fail; ++ ++ netif_napi_add(priv->netdev, &tqp_vector->napi, ++ hns3_nic_common_poll, NAPI_POLL_WEIGHT); ++ } ++ ++ return 0; ++ ++map_ring_fail: ++ while (i--) ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ ++ return ret; ++} ++ ++static void 
hns3_nic_init_coal_cfg(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hns3_enet_coalesce *tx_coal = &priv->tx_coal; ++ struct hns3_enet_coalesce *rx_coal = &priv->rx_coal; ++ ++ /* initialize the configuration for interrupt coalescing. ++ * 1. GL (Interrupt Gap Limiter) ++ * 2. RL (Interrupt Rate Limiter) ++ * 3. QL (Interrupt Quantity Limiter) ++ * ++ * Default: enable interrupt coalescing self-adaptive and GL ++ */ ++ tx_coal->adapt_enable = 1; ++ rx_coal->adapt_enable = 1; ++ ++ tx_coal->int_gl = HNS3_INT_GL_50K; ++ rx_coal->int_gl = HNS3_INT_GL_50K; ++ ++ rx_coal->flow_level = HNS3_FLOW_LOW; ++ tx_coal->flow_level = HNS3_FLOW_LOW; ++ ++ if (ae_dev->dev_specs.int_ql_max) { ++ tx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ rx_coal->int_ql = HNS3_INT_QL_DEFAULT_CFG; ++ } ++} ++ ++static int hns3_nic_alloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ struct hnae3_vector_info *vector; ++ struct pci_dev *pdev = h->pdev; ++ u16 tqp_num = h->kinfo.num_tqps; ++ u16 vector_num; ++ int ret = 0; ++ u16 i; ++ ++ /* RSS size, cpu online and vector_num should be the same */ ++ /* Should consider 2p/4p later */ ++ vector_num = min_t(u16, num_online_cpus(), tqp_num); ++ ++ vector = devm_kcalloc(&pdev->dev, vector_num, sizeof(*vector), ++ GFP_KERNEL); ++ if (!vector) ++ return -ENOMEM; ++ ++ /* save the actual available vector number */ ++ vector_num = h->ae_algo->ops->get_vector(h, vector_num, vector); ++ ++ priv->vector_num = vector_num; ++ priv->tqp_vector = (struct hns3_enet_tqp_vector *) ++ devm_kcalloc(&pdev->dev, vector_num, sizeof(*priv->tqp_vector), ++ GFP_KERNEL); ++ if (!priv->tqp_vector) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ tqp_vector->idx = i; ++ tqp_vector->mask_addr = vector[i].io_addr; ++ tqp_vector->vector_irq = vector[i].vector; ++ hns3_vector_coalesce_init(tqp_vector, priv); ++ } ++ ++out: ++ devm_kfree(&pdev->dev, vector); ++ return ret; ++} ++ ++static void hns3_clear_ring_group(struct hns3_enet_ring_group *group) ++{ ++ group->ring = NULL; ++ group->count = 0; ++} ++ ++static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_ring_chain_node *vector_ring_chain; ++ struct hnae3_handle *h = priv->ae_handle; ++ struct hns3_enet_tqp_vector *tqp_vector; ++ int i; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ tqp_vector = &priv->tqp_vector[i]; ++ ++ if (!tqp_vector->rx_group.ring && !tqp_vector->tx_group.ring) ++ continue; ++ ++ /* Since the mapping can be overwritten, when fail to get the ++ * chain between vector and ring, we should go on to deal with ++ * the remaining options. 
++ */ ++ vector_ring_chain = hns3_get_vector_ring_chain(tqp_vector); ++ if (!vector_ring_chain) ++ dev_warn(priv->dev, "failed to get ring chain\n"); ++ ++ h->ae_algo->ops->unmap_ring_from_vector(h, ++ tqp_vector->vector_irq, vector_ring_chain); ++ ++ hns3_free_vector_ring_chain(tqp_vector, vector_ring_chain); ++ ++ hns3_clear_ring_group(&tqp_vector->rx_group); ++ hns3_clear_ring_group(&tqp_vector->tx_group); ++ netif_napi_del(&priv->tqp_vector[i].napi); ++ } ++} ++ ++static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i, ret; ++ ++ for (i = 0; i < priv->vector_num; i++) { ++ struct hns3_enet_tqp_vector *tqp_vector; ++ ++ tqp_vector = &priv->tqp_vector[i]; ++ ret = h->ae_algo->ops->put_vector(h, tqp_vector->vector_irq); ++ if (ret) ++ return; ++ } ++ ++ devm_kfree(&pdev->dev, priv->tqp_vector); ++} ++ ++static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, ++ unsigned int ring_type) ++{ ++ int queue_num = priv->ae_handle->kinfo.num_tqps; ++ struct hns3_enet_ring *ring; ++ int desc_num; ++ ++ if (ring_type == HNAE3_RING_TYPE_TX) { ++ ring = &priv->ring[q->tqp_index]; ++ desc_num = priv->ae_handle->kinfo.num_tx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->tx_copybreak = priv->tx_copybreak; ++ ring->last_to_use = 0; ++ } else { ++ ring = &priv->ring[q->tqp_index + queue_num]; ++ desc_num = priv->ae_handle->kinfo.num_rx_desc; ++ ring->queue_index = q->tqp_index; ++ ring->rx_copybreak = priv->rx_copybreak; ++ } ++ ++ hnae3_set_bit(ring->flag, HNAE3_RING_TYPE_B, ring_type); ++ ++ ring->tqp = q; ++ ring->desc = NULL; ++ ring->desc_cb = NULL; ++ ring->dev = priv->dev; ++ ring->desc_dma_addr = 0; ++ ring->buf_size = q->buf_size; ++ ring->desc_num = desc_num; ++ ring->next_to_use = 0; ++ ring->next_to_clean = 0; ++} ++ ++static void hns3_queue_to_ring(struct hnae3_queue *tqp, ++ struct hns3_nic_priv *priv) ++{ ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_TX); ++ hns3_ring_get_cfg(tqp, priv, HNAE3_RING_TYPE_RX); ++} ++ ++static int hns3_get_ring_config(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ struct pci_dev *pdev = h->pdev; ++ int i; ++ ++ priv->ring = devm_kzalloc(&pdev->dev, ++ array3_size(h->kinfo.num_tqps, ++ sizeof(*priv->ring), 2), ++ GFP_KERNEL); ++ if (!priv->ring) ++ return -ENOMEM; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) ++ hns3_queue_to_ring(h->kinfo.tqp[i], priv); ++ ++ return 0; ++} ++ ++static void hns3_put_ring_config(struct hns3_nic_priv *priv) ++{ ++ if (!priv->ring) ++ return; ++ ++ devm_kfree(priv->dev, priv->ring); ++ priv->ring = NULL; ++} ++ ++static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) ++{ ++ struct page_pool_params pp_params = { ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | ++ PP_FLAG_DMA_SYNC_DEV, ++ .order = hns3_page_order(ring), ++ .pool_size = ring->desc_num * hns3_buf_size(ring) / ++ (PAGE_SIZE << hns3_page_order(ring)), ++ .nid = dev_to_node(ring_to_dev(ring)), ++ .dev = ring_to_dev(ring), ++ .dma_dir = DMA_FROM_DEVICE, ++ .offset = 0, ++ .max_len = PAGE_SIZE << hns3_page_order(ring), ++ }; ++ ++ ring->page_pool = page_pool_create(&pp_params); ++ if (IS_ERR(ring->page_pool)) { ++ dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", ++ PTR_ERR(ring->page_pool)); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) ++{ ++ int ret; ++ ++ if (ring->desc_num <= 0 || ring->buf_size <= 0) ++ return -EINVAL; 
++ ++ ring->desc_cb = devm_kcalloc(ring_to_dev(ring), ring->desc_num, ++ sizeof(ring->desc_cb[0]), GFP_KERNEL); ++ if (!ring->desc_cb) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = hns3_alloc_desc(ring); ++ if (ret) ++ goto out_with_desc_cb; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ if (page_pool_enabled) ++ hns3_alloc_page_pool(ring); ++ ++ ret = hns3_alloc_ring_buffers(ring); ++ if (ret) ++ goto out_with_desc; ++ } else { ++ hns3_init_tx_spare_buffer(ring); ++ } ++ ++ return 0; ++ ++out_with_desc: ++ hns3_free_desc(ring); ++out_with_desc_cb: ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++out: ++ return ret; ++} ++ ++void hns3_fini_ring(struct hns3_enet_ring *ring) ++{ ++ hns3_free_desc(ring); ++ devm_kfree(ring_to_dev(ring), ring->desc_cb); ++ ring->desc_cb = NULL; ++ ring->next_to_clean = 0; ++ ring->next_to_use = 0; ++ ring->last_to_use = 0; ++ ring->pending_buf = 0; ++ if (!HNAE3_IS_TX_RING(ring) && ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ } else if (HNAE3_IS_TX_RING(ring) && ring->tx_spare) { ++ struct hns3_tx_spare *tx_spare = ring->tx_spare; ++ ++ dma_unmap_page(ring_to_dev(ring), tx_spare->dma, tx_spare->len, ++ DMA_TO_DEVICE); ++ free_pages((unsigned long)tx_spare->buf, ++ get_order(tx_spare->len)); ++ devm_kfree(ring_to_dev(ring), tx_spare); ++ ring->tx_spare = NULL; ++ } ++ ++ if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int hns3_buf_size2type(u32 buf_size) ++{ ++ int bd_size_type; ++ ++ switch (buf_size) { ++ case 512: ++ bd_size_type = HNS3_BD_SIZE_512_TYPE; ++ break; ++ case 1024: ++ bd_size_type = HNS3_BD_SIZE_1024_TYPE; ++ break; ++ case 2048: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ break; ++ case 4096: ++ bd_size_type = HNS3_BD_SIZE_4096_TYPE; ++ break; ++ default: ++ bd_size_type = HNS3_BD_SIZE_2048_TYPE; ++ } ++ ++ return bd_size_type; ++} ++ ++static void hns3_init_ring_hw(struct hns3_enet_ring *ring) ++{ ++ dma_addr_t dma = ring->desc_dma_addr; ++ struct hnae3_queue *q = ring->tqp; ++ ++ if (!HNAE3_IS_TX_RING(ring)) { ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_L_REG, (u32)dma); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_LEN_REG, ++ hns3_buf_size2type(ring->buf_size)); ++ hns3_write_dev(q, HNS3_RING_RX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } else { ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_L_REG, ++ (u32)dma); ++ hns3_write_dev(q, HNS3_RING_TX_RING_BASEADDR_H_REG, ++ (u32)((dma >> 31) >> 1)); ++ ++ hns3_write_dev(q, HNS3_RING_TX_RING_BD_NUM_REG, ++ ring->desc_num / 8 - 1); ++ } ++} ++ ++static void hns3_init_tx_ring_tc(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ struct hnae3_tc_info *tc_info = &kinfo->tc_info; ++ int i; ++ ++ for (i = 0; i < tc_info->num_tc; i++) { ++ int j; ++ ++ for (j = 0; j < tc_info->tqp_count[i]; j++) { ++ struct hnae3_queue *q; ++ ++ q = priv->ring[tc_info->tqp_offset[i] + j].tqp; ++ hns3_write_dev(q, HNS3_RING_TX_RING_TC_REG, i); ++ } ++ } ++} ++ ++int hns3_init_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int ring_num = h->kinfo.num_tqps * 2; ++ int i, j; ++ int ret; ++ ++ for (i = 0; i < ring_num; i++) { ++ ret = hns3_alloc_ring_memory(&priv->ring[i]); ++ if (ret) { ++ dev_err(priv->dev, ++ "Alloc ring memory fail! 
ret=%d\n", ret); ++ goto out_when_alloc_ring_memory; ++ } ++ ++ u64_stats_init(&priv->ring[i].syncp); ++ } ++ ++ return 0; ++ ++out_when_alloc_ring_memory: ++ for (j = i - 1; j >= 0; j--) ++ hns3_fini_ring(&priv->ring[j]); ++ ++ return -ENOMEM; ++} ++ ++static void hns3_uninit_all_ring(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_handle *h = priv->ae_handle; ++ int i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_fini_ring(&priv->ring[i]); ++ hns3_fini_ring(&priv->ring[i + h->kinfo.num_tqps]); ++ } ++} ++ ++/* Set mac addr if it is configured. or leave it to the AE driver */ ++static int hns3_init_mac_addr(struct net_device *netdev) ++{ ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ struct hnae3_handle *h = priv->ae_handle; ++ u8 mac_addr_temp[ETH_ALEN]; ++ int ret = 0; ++ ++ if (h->ae_algo->ops->get_mac_addr) ++ h->ae_algo->ops->get_mac_addr(h, mac_addr_temp); ++ ++ /* Check if the MAC address is valid, if not get a random one */ ++ if (!is_valid_ether_addr(mac_addr_temp)) { ++ eth_hw_addr_random(netdev); ++ hnae3_format_mac_addr(format_mac_addr, netdev->dev_addr); ++ dev_warn(priv->dev, "using random MAC address %s\n", ++ format_mac_addr); ++ } else if (!ether_addr_equal(netdev->dev_addr, mac_addr_temp)) { ++ eth_hw_addr_set(netdev, mac_addr_temp); ++ ether_addr_copy(netdev->perm_addr, mac_addr_temp); ++ } else { ++ return 0; ++ } ++ ++ if (h->ae_algo->ops->set_mac_addr) ++ ret = h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr, true); ++ ++ return ret; ++} ++ ++static int hns3_init_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ int ret = 0; ++ ++ if (h->ae_algo->ops->mac_connect_phy) ++ ret = h->ae_algo->ops->mac_connect_phy(h); ++ ++ return ret; ++} ++ ++static void hns3_uninit_phy(struct net_device *netdev) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ ++ if (h->ae_algo->ops->mac_disconnect_phy) ++ h->ae_algo->ops->mac_disconnect_phy(h); ++} ++ ++static int hns3_client_start(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_start) ++ return 0; ++ ++ return handle->ae_algo->ops->client_start(handle); ++} ++ ++static void hns3_client_stop(struct hnae3_handle *handle) ++{ ++ if (!handle->ae_algo->ops->client_stop) ++ return; ++ ++ handle->ae_algo->ops->client_stop(handle); ++} ++ ++static void hns3_info_show(struct hns3_nic_priv *priv) ++{ ++ struct hnae3_knic_private_info *kinfo = &priv->ae_handle->kinfo; ++ char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; ++ ++ hnae3_format_mac_addr(format_mac_addr, priv->netdev->dev_addr); ++ dev_info(priv->dev, "MAC address: %s\n", format_mac_addr); ++ dev_info(priv->dev, "Task queue pairs numbers: %u\n", kinfo->num_tqps); ++ dev_info(priv->dev, "RSS size: %u\n", kinfo->rss_size); ++ dev_info(priv->dev, "Allocated RSS size: %u\n", kinfo->req_rss_size); ++ dev_info(priv->dev, "RX buffer length: %u\n", kinfo->rx_buf_len); ++ dev_info(priv->dev, "Desc num per TX queue: %u\n", kinfo->num_tx_desc); ++ dev_info(priv->dev, "Desc num per RX queue: %u\n", kinfo->num_rx_desc); ++ dev_info(priv->dev, "Total number of enabled TCs: %u\n", ++ kinfo->tc_info.num_tc); ++ dev_info(priv->dev, "Max mtu size: %u\n", priv->netdev->max_mtu); ++} ++ ++static void hns3_set_cq_period_mode(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode mode, bool is_tx) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(priv->ae_handle->pdev); ++ struct hnae3_handle *handle = priv->ae_handle; ++ int i; ++ ++ if (is_tx) { ++ 
priv->tx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].tx_group.dim.mode = mode; ++ } else { ++ priv->rx_cqe_mode = mode; ++ ++ for (i = 0; i < priv->vector_num; i++) ++ priv->tqp_vector[i].rx_group.dim.mode = mode; ++ } ++ ++ if (hnae3_ae_dev_cq_supported(ae_dev)) { ++ u32 new_mode; ++ u64 reg; ++ ++ new_mode = (mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE) ? ++ HNS3_CQ_MODE_CQE : HNS3_CQ_MODE_EQE; ++ reg = is_tx ? HNS3_GL1_CQ_MODE_REG : HNS3_GL0_CQ_MODE_REG; ++ ++ writel(new_mode, handle->kinfo.io_base + reg); ++ } ++} ++ ++void hns3_cq_period_mode_init(struct hns3_nic_priv *priv, ++ enum dim_cq_period_mode tx_mode, ++ enum dim_cq_period_mode rx_mode) ++{ ++ hns3_set_cq_period_mode(priv, tx_mode, true); ++ hns3_set_cq_period_mode(priv, rx_mode, false); ++} ++ ++static void hns3_state_init(struct hnae3_handle *handle) ++{ ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(handle->pdev); ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state); ++ ++ if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) ++ set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags); ++ ++ if (test_bit(HNAE3_DEV_SUPPORT_HW_TX_CSUM_B, ae_dev->caps)) ++ set_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state); ++ ++ if (hnae3_ae_dev_rxd_adv_layout_supported(ae_dev)) ++ set_bit(HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE, &priv->state); ++} ++ ++static void hns3_state_uninit(struct hnae3_handle *handle) ++{ ++ struct hns3_nic_priv *priv = handle->priv; ++ ++ clear_bit(HNS3_NIC_STATE_INITED, &priv->state); ++} ++ ++static int hns3_client_init(struct hnae3_handle *handle) ++{ ++ struct pci_dev *pdev = handle->pdev; ++ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); ++ u16 alloc_tqps, max_rss_size; ++ struct hns3_nic_priv *priv; ++ struct net_device *netdev; ++ int ret; ++ ++ handle->ae_algo->ops->get_tqps_and_rss_info(handle, &alloc_tqps, ++ &max_rss_size); ++ netdev = alloc_etherdev_mq(sizeof(struct hns3_nic_priv), alloc_tqps); ++ if (!netdev) ++ return -ENOMEM; ++ ++ priv = netdev_priv(netdev); ++ priv->dev = &pdev->dev; ++ priv->netdev = netdev; ++ priv->ae_handle = handle; ++ priv->tx_timeout_count = 0; ++ priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; ++ set_bit(HNS3_NIC_STATE_DOWN, &priv->state); ++ ++ handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); ++ ++ handle->kinfo.netdev = netdev; ++ handle->priv = (void *)priv; ++ ++ hns3_init_mac_addr(netdev); ++ ++ hns3_set_default_feature(netdev); ++ ++ netdev->watchdog_timeo = HNS3_TX_TIMEOUT; ++ netdev->priv_flags |= IFF_UNICAST_FLT; ++ netdev->netdev_ops = &hns3_nic_netdev_ops; ++ SET_NETDEV_DEV(netdev, &pdev->dev); ++ hns3_ethtool_set_ops(netdev); ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_get_ring_cfg; ++ } ++ ++ hns3_nic_init_coal_cfg(priv); ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_alloc_vector_data; ++ } ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_vector_data; ++ } ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) { ++ ret = -ENOMEM; ++ goto out_init_ring; ++ } ++ ++ hns3_cq_period_mode_init(priv, DIM_CQ_PERIOD_MODE_START_FROM_EQE, ++ 
DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ ++ ret = hns3_init_phy(netdev); ++ if (ret) ++ goto out_init_phy; ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto out_init_irq_fail; ++ } ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); ++ goto out_client_start; ++ } ++ ++ hns3_dcbnl_setup(handle); ++ ++ ret = hns3_dbg_init(handle); ++ if (ret) { ++ dev_err(priv->dev, "failed to init debugfs, ret = %d\n", ++ ret); ++ goto out_client_start; ++ } ++ ++ netdev->max_mtu = HNS3_MAX_MTU(ae_dev->dev_specs.max_frm_size); ++ ++ hns3_state_init(handle); ++ ++ ret = register_netdev(netdev); ++ if (ret) { ++ dev_err(priv->dev, "probe register netdev fail!\n"); ++ goto out_reg_netdev_fail; ++ } ++ ++ if (netif_msg_drv(handle)) ++ hns3_info_show(priv); ++ ++ return ret; ++ ++out_reg_netdev_fail: ++ hns3_state_uninit(handle); ++ hns3_dbg_uninit(handle); ++ hns3_client_stop(handle); ++out_client_start: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++out_init_irq_fail: ++ hns3_uninit_phy(netdev); ++out_init_phy: ++ hns3_uninit_all_ring(priv); ++out_init_ring: ++ hns3_nic_uninit_vector_data(priv); ++out_init_vector_data: ++ hns3_nic_dealloc_vector_data(priv); ++out_alloc_vector_data: ++ priv->ring = NULL; ++out_get_ring_cfg: ++ priv->ae_handle = NULL; ++ free_netdev(netdev); ++ return ret; ++} ++ ++static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (netdev->reg_state != NETREG_UNINITIALIZED) ++ unregister_netdev(netdev); ++ ++ hns3_client_stop(handle); ++ ++ hns3_uninit_phy(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ goto out_netdev_free; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ ++ hns3_nic_uninit_irq(priv); ++ ++ hns3_clear_all_ring(handle, true); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++out_netdev_free: ++ hns3_dbg_uninit(handle); ++ free_netdev(netdev); ++} ++ ++static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ ++ if (!netdev) ++ return; ++ ++ if (linkup) { ++ netif_tx_wake_all_queues(netdev); ++ netif_carrier_on(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link up\n"); ++ } else { ++ netif_carrier_off(netdev); ++ netif_tx_stop_all_queues(netdev); ++ if (netif_msg_link(handle)) ++ netdev_info(netdev, "link down\n"); ++ } ++} ++ ++static void hns3_clear_tx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_clean != ring->next_to_use) { ++ ring->desc[ring->next_to_clean].tx.bdtp_fe_sc_vld_ra_ri = 0; ++ hns3_free_buffer_detach(ring, ring->next_to_clean, 0); ++ ring_ptr_move_fw(ring, next_to_clean); ++ } ++ ++ ring->pending_buf = 0; ++} ++ ++static int hns3_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ struct hns3_desc_cb res_cbs; ++ int ret; ++ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will 
be freed by ++ * stack, so we need to replace the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ ret = hns3_alloc_and_map_buffer(ring, &res_cbs); ++ if (ret) { ++ hns3_ring_stats_update(ring, sw_err_cnt); ++ /* if alloc new buffer fail, exit directly ++ * and reclear in up flow. ++ */ ++ netdev_warn(ring_to_netdev(ring), ++ "reserve buffer map failed, ret = %d\n", ++ ret); ++ return ret; ++ } ++ hns3_replace_buffer(ring, ring->next_to_use, &res_cbs); ++ } ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++ ++ /* Free the pending skb in rx ring */ ++ if (ring->skb) { ++ dev_kfree_skb_any(ring->skb); ++ ring->skb = NULL; ++ ring->pending_buf = 0; ++ } ++ ++ return 0; ++} ++ ++static void hns3_force_clear_rx_ring(struct hns3_enet_ring *ring) ++{ ++ while (ring->next_to_use != ring->next_to_clean) { ++ /* When a buffer is not reused, it's memory has been ++ * freed in hns3_handle_rx_bd or will be freed by ++ * stack, so only need to unmap the buffer here. ++ */ ++ if (!ring->desc_cb[ring->next_to_use].reuse_flag) { ++ hns3_unmap_buffer(ring, ++ &ring->desc_cb[ring->next_to_use]); ++ ring->desc_cb[ring->next_to_use].dma = 0; ++ } ++ ++ ring_ptr_move_fw(ring, next_to_use); ++ } ++} ++ ++static void hns3_clear_all_ring(struct hnae3_handle *h, bool force) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ u32 i; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ struct hns3_enet_ring *ring; ++ ++ ring = &priv->ring[i]; ++ hns3_clear_tx_ring(ring); ++ ++ ring = &priv->ring[i + h->kinfo.num_tqps]; ++ /* Continue to clear other rings even if clearing some ++ * rings failed. ++ */ ++ if (force) ++ hns3_force_clear_rx_ring(ring); ++ else ++ hns3_clear_rx_ring(ring); ++ } ++} ++ ++int hns3_nic_reset_all_ring(struct hnae3_handle *h) ++{ ++ struct net_device *ndev = h->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ struct hns3_enet_ring *rx_ring; ++ int i, j; ++ int ret; ++ ++ ret = h->ae_algo->ops->reset_queue(h); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i < h->kinfo.num_tqps; i++) { ++ hns3_init_ring_hw(&priv->ring[i]); ++ ++ /* We need to clear tx ring here because self test will ++ * use the ring and will not run down before up ++ */ ++ hns3_clear_tx_ring(&priv->ring[i]); ++ priv->ring[i].next_to_clean = 0; ++ priv->ring[i].next_to_use = 0; ++ priv->ring[i].last_to_use = 0; ++ ++ rx_ring = &priv->ring[i + h->kinfo.num_tqps]; ++ hns3_init_ring_hw(rx_ring); ++ ret = hns3_clear_rx_ring(rx_ring); ++ if (ret) ++ return ret; ++ ++ /* We can not know the hardware head and tail when this ++ * function is called in reset flow, so we reuse all desc. 
++ */ ++ for (j = 0; j < rx_ring->desc_num; j++) ++ hns3_reuse_buffer(rx_ring, j); ++ ++ rx_ring->next_to_clean = 0; ++ rx_ring->next_to_use = 0; ++ } ++ ++ hns3_init_tx_ring_tc(priv); ++ ++ return 0; ++} ++ ++static int hns3_reset_notify_down_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct net_device *ndev = kinfo->netdev; ++ struct hns3_nic_priv *priv = netdev_priv(ndev); ++ ++ if (test_and_set_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) ++ return 0; ++ ++ if (!netif_running(ndev)) ++ return 0; ++ ++ return hns3_nic_net_stop(ndev); ++} ++ ++static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) ++{ ++ struct hnae3_knic_private_info *kinfo = &handle->kinfo; ++ struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); ++ int ret = 0; ++ ++ if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_err(kinfo->netdev, "device is not initialized yet\n"); ++ return -EFAULT; ++ } ++ ++ clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ ++ if (netif_running(kinfo->netdev)) { ++ ret = hns3_nic_net_open(kinfo->netdev); ++ if (ret) { ++ set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); ++ netdev_err(kinfo->netdev, ++ "net up fail, ret=%d!\n", ret); ++ return ret; ++ } ++ } ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_init_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ int ret; ++ ++ /* Carrier off reporting is important to ethtool even BEFORE open */ ++ netif_carrier_off(netdev); ++ ++ ret = hns3_get_ring_config(priv); ++ if (ret) ++ return ret; ++ ++ ret = hns3_nic_alloc_vector_data(priv); ++ if (ret) ++ goto err_put_ring; ++ ++ ret = hns3_nic_init_vector_data(priv); ++ if (ret) ++ goto err_dealloc_vector; ++ ++ ret = hns3_init_all_ring(priv); ++ if (ret) ++ goto err_uninit_vector; ++ ++ hns3_cq_period_mode_init(priv, priv->tx_cqe_mode, priv->rx_cqe_mode); ++ ++ /* the device can work without cpu rmap, only aRFS needs it */ ++ ret = hns3_set_rx_cpu_rmap(netdev); ++ if (ret) ++ dev_warn(priv->dev, "set rx cpu rmap fail, ret=%d\n", ret); ++ ++ ret = hns3_nic_init_irq(priv); ++ if (ret) { ++ dev_err(priv->dev, "init irq failed! ret=%d\n", ret); ++ hns3_free_rx_cpu_rmap(netdev); ++ goto err_init_irq_fail; ++ } ++ ++ if (!hns3_is_phys_func(handle->pdev)) ++ hns3_init_mac_addr(netdev); ++ ++ ret = hns3_client_start(handle); ++ if (ret) { ++ dev_err(priv->dev, "hns3_client_start fail! 
ret=%d\n", ret); ++ goto err_client_start_fail; ++ } ++ ++ set_bit(HNS3_NIC_STATE_INITED, &priv->state); ++ ++ return ret; ++ ++err_client_start_fail: ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++err_init_irq_fail: ++ hns3_uninit_all_ring(priv); ++err_uninit_vector: ++ hns3_nic_uninit_vector_data(priv); ++err_dealloc_vector: ++ hns3_nic_dealloc_vector_data(priv); ++err_put_ring: ++ hns3_put_ring_config(priv); ++ ++ return ret; ++} ++ ++static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle) ++{ ++ struct net_device *netdev = handle->kinfo.netdev; ++ struct hns3_nic_priv *priv = netdev_priv(netdev); ++ ++ if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { ++ netdev_warn(netdev, "already uninitialized\n"); ++ return 0; ++ } ++ ++ hns3_free_rx_cpu_rmap(netdev); ++ hns3_nic_uninit_irq(priv); ++ hns3_clear_all_ring(handle, true); ++ hns3_reset_tx_queue(priv->ae_handle); ++ ++ hns3_nic_uninit_vector_data(priv); ++ ++ hns3_nic_dealloc_vector_data(priv); ++ ++ hns3_uninit_all_ring(priv); ++ ++ hns3_put_ring_config(priv); ++ ++ return 0; ++} ++ ++int hns3_reset_notify(struct hnae3_handle *handle, ++ enum hnae3_reset_notify_type type) ++{ ++ int ret = 0; ++ ++ switch (type) { ++ case HNAE3_UP_CLIENT: ++ ret = hns3_reset_notify_up_enet(handle); ++ break; ++ case HNAE3_DOWN_CLIENT: ++ ret = hns3_reset_notify_down_enet(handle); ++ break; ++ case HNAE3_INIT_CLIENT: ++ ret = hns3_reset_notify_init_enet(handle); ++ break; ++ case HNAE3_UNINIT_CLIENT: ++ ret = hns3_reset_notify_uninit_enet(handle); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int hns3_change_channels(struct hnae3_handle *handle, u32 new_tqp_num, ++ bool rxfh_configured) ++{ ++ int ret; ++ ++ ret = handle->ae_algo->ops->set_channels(handle, new_tqp_num, ++ rxfh_configured); ++ if (ret) { ++ dev_err(&handle->pdev->dev, ++ "Change tqp num(%u) fail.\n", new_tqp_num); ++ return ret; ++ } ++ ++ ret = hns3_reset_notify(handle, HNAE3_INIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(handle, HNAE3_UP_CLIENT); ++ if (ret) ++ hns3_reset_notify(handle, HNAE3_UNINIT_CLIENT); ++ ++ return ret; ++} ++ ++int hns3_set_channels(struct net_device *netdev, ++ struct ethtool_channels *ch) ++{ ++ struct hnae3_handle *h = hns3_get_handle(netdev); ++ struct hnae3_knic_private_info *kinfo = &h->kinfo; ++ bool rxfh_configured = netif_is_rxfh_configured(netdev); ++ u32 new_tqp_num = ch->combined_count; ++ u16 org_tqp_num; ++ int ret; ++ ++ if (hns3_nic_resetting(netdev)) ++ return -EBUSY; ++ ++ if (ch->rx_count || ch->tx_count) ++ return -EINVAL; ++ ++ if (kinfo->tc_info.mqprio_active) { ++ dev_err(&netdev->dev, ++ "it's not allowed to set channels via ethtool when MQPRIO mode is on\n"); ++ return -EINVAL; ++ } ++ ++ if (new_tqp_num > hns3_get_max_available_channels(h) || ++ new_tqp_num < 1) { ++ dev_err(&netdev->dev, ++ "Change tqps fail, the tqp range is from 1 to %u", ++ hns3_get_max_available_channels(h)); ++ return -EINVAL; ++ } ++ ++ if (kinfo->rss_size == new_tqp_num) ++ return 0; ++ ++ netif_dbg(h, drv, netdev, ++ "set channels: tqp_num=%u, rxfh=%d\n", ++ new_tqp_num, rxfh_configured); ++ ++ ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); ++ if (ret) ++ return ret; ++ ++ ret = hns3_reset_notify(h, HNAE3_UNINIT_CLIENT); ++ if (ret) ++ return ret; ++ ++ org_tqp_num = h->kinfo.num_tqps; ++ ret = hns3_change_channels(h, new_tqp_num, rxfh_configured); ++ if (ret) { ++ int ret1; ++ ++ netdev_warn(netdev, ++ "Change channels fail, revert to old value\n"); ++ ret1 = 
hns3_change_channels(h, org_tqp_num, rxfh_configured); ++ if (ret1) { ++ netdev_err(netdev, ++ "revert to old channel fail\n"); ++ return ret1; ++ } ++ ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static const struct hns3_hw_error_info hns3_hw_err[] = { ++ { .type = HNAE3_PPU_POISON_ERROR, ++ .msg = "PPU poison" }, ++ { .type = HNAE3_CMDQ_ECC_ERROR, ++ .msg = "IMP CMDQ error" }, ++ { .type = HNAE3_IMP_RD_POISON_ERROR, ++ .msg = "IMP RD poison" }, ++ { .type = HNAE3_ROCEE_AXI_RESP_ERROR, ++ .msg = "ROCEE AXI RESP error" }, ++}; ++ ++static void hns3_process_hw_error(struct hnae3_handle *handle, ++ enum hnae3_hw_error_type type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(hns3_hw_err); i++) { ++ if (hns3_hw_err[i].type == type) { ++ dev_err(&handle->pdev->dev, "Detected %s!\n", ++ hns3_hw_err[i].msg); ++ break; ++ } ++ } ++} ++ ++static const struct hnae3_client_ops client_ops = { ++ .init_instance = hns3_client_init, ++ .uninit_instance = hns3_client_uninit, ++ .link_status_change = hns3_link_status_change, ++ .reset_notify = hns3_reset_notify, ++ .process_hw_error = hns3_process_hw_error, ++}; ++ ++/* hns3_init_module - Driver registration routine ++ * hns3_init_module is the first routine called when the driver is ++ * loaded. All it does is register with the PCI subsystem. ++ */ ++static int __init hns3_init_module(void) ++{ ++ int ret; ++ ++ pr_info("%s: %s - version\n", hns3_driver_name, hns3_driver_string); ++ pr_info("%s: %s\n", hns3_driver_name, hns3_copyright); ++ ++ client.type = HNAE3_CLIENT_KNIC; ++ snprintf(client.name, HNAE3_CLIENT_NAME_LENGTH, "%s", ++ hns3_driver_name); ++ ++ client.ops = &client_ops; ++ ++ INIT_LIST_HEAD(&client.node); ++ ++ hns3_dbg_register_debugfs(hns3_driver_name); ++ ++ ret = hnae3_register_client(&client); ++ if (ret) ++ goto err_reg_client; ++ ++ ret = pci_register_driver(&hns3_driver); ++ if (ret) ++ goto err_reg_driver; ++ ++ return ret; ++ ++err_reg_driver: ++ hnae3_unregister_client(&client); ++err_reg_client: ++ hns3_dbg_unregister_debugfs(); ++ return ret; ++} ++module_init(hns3_init_module); ++ ++/* hns3_exit_module - Driver exit cleanup routine ++ * hns3_exit_module is called just before the driver is removed ++ * from memory. ++ */ ++static void __exit hns3_exit_module(void) ++{ ++ pci_unregister_driver(&hns3_driver); ++ hnae3_unregister_client(&client); ++ hns3_dbg_unregister_debugfs(); ++} ++module_exit(hns3_exit_module); ++ ++MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver"); ++MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("pci:hns-nic"); +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_rx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rx unsigned int start; do { @@ -1599,11 +7449,10 @@ index e5828a658caf4..a866bea651103 100644 } /** -diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -index 3b6c7b5857376..5051cdff2384b 100644 ---- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c -+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c -@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats) +diff -rupN linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c +--- linux.orig/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/huawei/hinic/hinic_tx.c 2022-12-04 10:40:26.684034126 -0500 +@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_tx unsigned int start; do { @@ -1620,11 +7469,10 @@ index 3b6c7b5857376..5051cdff2384b 100644 } /** -diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -index 2cca9e84e31e1..34ab5ff9823b7 100644 ---- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c -@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +--- linux.orig/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c 2022-12-04 10:40:26.684034126 -0500 +@@ -1229,10 +1229,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1637,7 +7485,7 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net_device *netdev, +@@ -1245,10 +1245,10 @@ static void fm10k_get_stats64(struct net continue; do { @@ -1650,11 +7498,10 @@ index 2cca9e84e31e1..34ab5ff9823b7 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -index e9cd0fa6a0d2f..90f2eee78a3ee 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c -@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -154,7 +154,7 @@ __i40e_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1663,7 +7510,7 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 * Assumes that queue stats are defined in i40e_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct i40e_ring *ring) +@@ -172,16 +172,16 @@ i40e_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1683,11 +7530,10 @@ index e9cd0fa6a0d2f..90f2eee78a3ee 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c -index e3d9804aeb25e..09a9f67d9ebc0 100644 ---- a/drivers/net/ethernet/intel/i40e/i40e_main.c -+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c -@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring, +diff -rupN linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c linux/drivers/net/ethernet/intel/i40e/i40e_main.c +--- linux.orig/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/i40e/i40e_main.c 2022-12-04 10:40:26.684034126 -0500 +@@ -418,10 +418,10 @@ static void i40e_get_netdev_stats_struct unsigned int start; do { @@ -1700,7 +7546,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; -@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev, +@@ -471,10 +471,10 @@ static void i40e_get_netdev_stats_struct if (!ring) continue; do { @@ -1713,7 +7559,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; -@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -896,10 +896,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1726,7 +7572,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -914,10 +914,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1739,7 +7585,7 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 rx_b += bytes; rx_p += packets; rx_buf += p->rx_stats.alloc_buff_failed; -@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +@@ -934,10 +934,10 @@ static void i40e_update_vsi_stats(struct continue; do { @@ -1752,11 +7598,10 @@ index e3d9804aeb25e..09a9f67d9ebc0 100644 tx_b += bytes; tx_p += packets; tx_restart += p->tx_stats.restart_queue; -diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -index e535d4c3da49d..fafa3406e0bcc 100644 ---- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c -@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, +diff -rupN linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/iavf/iavf_ethtool.c 2022-12-04 10:40:26.684034126 -0500 +@@ -147,7 +147,7 @@ __iavf_add_ethtool_stats(u64 **data, voi * @ring: the ring to copy * * Queue statistics must be copied while protected by @@ -1765,7 +7610,7 @@ index e535d4c3da49d..fafa3406e0bcc 100644 * Assumes that queue stats are defined in iavf_gstrings_queue_stats. If the * ring pointer is null, zero out the queue stat values and update the data * pointer. 
Otherwise safely copy the stats from the ring into the supplied -@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) +@@ -165,14 +165,14 @@ iavf_add_queue_stats(u64 **data, struct /* To avoid invalid statistics values, ensure that we keep retrying * the copy until we get a consistent value according to @@ -1783,11 +7628,10 @@ index e535d4c3da49d..fafa3406e0bcc 100644 /* Once we successfully copy the stats in, update the data pointer */ *data += size; -diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c -index e109cb93886be..b7394c7e5eed2 100644 ---- a/drivers/net/ethernet/intel/ice/ice_main.c -+++ b/drivers/net/ethernet/intel/ice/ice_main.c -@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_stats_sync *syncp, +diff -rupN linux.orig/drivers/net/ethernet/intel/ice/ice_main.c linux/drivers/net/ethernet/intel/ice/ice_main.c +--- linux.orig/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ice/ice_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6295,10 +6295,10 @@ ice_fetch_u64_stats_per_ring(struct u64_ unsigned int start; do { @@ -1800,11 +7644,10 @@ index e109cb93886be..b7394c7e5eed2 100644 } /** -diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c -index c14fc871dd417..23c6fcfcb905c 100644 ---- a/drivers/net/ethernet/intel/igb/igb_ethtool.c -+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c -@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c linux/drivers/net/ethernet/intel/igb/igb_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -2311,15 +2311,15 @@ static void igb_get_ethtool_stats(struct ring = adapter->tx_ring[j]; do { @@ -1824,7 +7667,7 @@ index c14fc871dd417..23c6fcfcb905c 100644 data[i+2] += restart2; i += IGB_TX_QUEUE_STATS_LEN; -@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct net_device *netdev, +@@ -2327,13 +2327,13 @@ static void igb_get_ethtool_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1840,11 +7683,10 @@ index c14fc871dd417..23c6fcfcb905c 100644 i += IGB_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c -index 2796e81d27260..98df55dc1e933 100644 ---- a/drivers/net/ethernet/intel/igb/igb_main.c -+++ b/drivers/net/ethernet/intel/igb/igb_main.c -@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igb/igb_main.c linux/drivers/net/ethernet/intel/igb/igb_main.c +--- linux.orig/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igb/igb_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -6633,10 +6633,10 @@ void igb_update_stats(struct igb_adapter } do { @@ -1857,7 +7699,7 @@ index 2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter *adapter) +@@ -6649,10 +6649,10 @@ void igb_update_stats(struct igb_adapter for (i = 0; i < adapter->num_tx_queues; i++) { struct igb_ring *ring = adapter->tx_ring[i]; do { @@ -1870,11 +7712,10 @@ index 
2796e81d27260..98df55dc1e933 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c -index 8cc077b712add..5a26a7805ef80 100644 ---- a/drivers/net/ethernet/intel/igc/igc_ethtool.c -+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c -@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c linux/drivers/net/ethernet/intel/igc/igc_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -839,15 +839,15 @@ static void igc_ethtool_get_stats(struct ring = adapter->tx_ring[j]; do { @@ -1894,7 +7735,7 @@ index 8cc077b712add..5a26a7805ef80 100644 data[i + 2] += restart2; i += IGC_TX_QUEUE_STATS_LEN; -@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct net_device *netdev, +@@ -855,13 +855,13 @@ static void igc_ethtool_get_stats(struct for (j = 0; j < adapter->num_rx_queues; j++) { ring = adapter->rx_ring[j]; do { @@ -1910,11 +7751,10 @@ index 8cc077b712add..5a26a7805ef80 100644 i += IGC_RX_QUEUE_STATS_LEN; } spin_unlock(&adapter->stats64_lock); -diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c -index ebff0e04045d6..944299b06cc3d 100644 ---- a/drivers/net/ethernet/intel/igc/igc_main.c -+++ b/drivers/net/ethernet/intel/igc/igc_main.c -@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter *adapter) +diff -rupN linux.orig/drivers/net/ethernet/intel/igc/igc_main.c linux/drivers/net/ethernet/intel/igc/igc_main.c +--- linux.orig/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/igc/igc_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4645,10 +4645,10 @@ void igc_update_stats(struct igc_adapter } do { @@ -1927,7 +7767,7 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter *adapter) +@@ -4662,10 +4662,10 @@ void igc_update_stats(struct igc_adapter struct igc_ring *ring = adapter->tx_ring[i]; do { @@ -1940,11 +7780,10 @@ index ebff0e04045d6..944299b06cc3d 100644 bytes += _bytes; packets += _packets; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -index 04f453eabef64..51bcf0df3adcc 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c -@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -1335,10 +1335,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1957,7 +7796,7 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) { -@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, +@@ -1351,10 +1351,10 @@ static void ixgbe_get_ethtool_stats(stru } do { @@ -1970,11 +7809,10 @@ index 04f453eabef64..51bcf0df3adcc 100644 i += 2; } -diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -index d1e430b8c8aa1..01c5548f181d5 100644 ---- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c -@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -9041,10 +9041,10 @@ static void ixgbe_get_ring_stats64(struc if (ring) { do { @@ -1987,7 +7825,7 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net_device *netdev, +@@ -9064,10 +9064,10 @@ static void ixgbe_get_stats64(struct net if (ring) { do { @@ -2000,11 +7838,10 @@ index d1e430b8c8aa1..01c5548f181d5 100644 stats->rx_packets += packets; stats->rx_bytes += bytes; } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -index fed46872af2bf..b4632b67ab143 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c -@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ethtool.c 2022-12-04 10:40:26.688034116 -0500 +@@ -458,10 +458,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2017,7 +7854,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -475,10 +475,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2030,7 +7867,7 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } -@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev, +@@ -492,10 +492,10 @@ static void ixgbevf_get_ethtool_stats(st } do { @@ -2043,11 +7880,10 @@ index fed46872af2bf..b4632b67ab143 100644 i += 2; } } -diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -index 2f12fbe229c15..1d31b8cff4f10 100644 ---- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c -@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(struct rtnl_link_stats64 *stats, +diff -rupN linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +--- linux.orig/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 2022-12-04 10:40:26.688034116 -0500 +@@ -4350,10 +4350,10 @@ static void ixgbevf_get_tx_ring_stats(st if (ring) { do { @@ -2060,7 +7896,7 @@ index 2f12fbe229c15..1d31b8cff4f10 100644 stats->tx_bytes += bytes; stats->tx_packets += packets; } -@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net_device *netdev, +@@ -4376,10 +4376,10 @@ static void ixgbevf_get_stats(struct net for (i = 0; i < adapter->num_rx_queues; i++) { ring = adapter->rx_ring[i]; do { @@ -2073,11 +7909,10 @@ index 2f12fbe229c15..1d31b8cff4f10 
100644 stats->rx_bytes += bytes; stats->rx_packets += packets; } -diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c -index 0caa2df87c044..89ea3ef0ee162 100644 ---- a/drivers/net/ethernet/marvell/mvneta.c -+++ b/drivers/net/ethernet/marvell/mvneta.c -@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvneta.c linux/drivers/net/ethernet/marvell/mvneta.c +--- linux.orig/drivers/net/ethernet/marvell/mvneta.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvneta.c 2022-12-04 10:40:26.692034106 -0500 +@@ -813,14 +813,14 @@ mvneta_get_stats64(struct net_device *de cpu_stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2094,7 +7929,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4762,7 +4762,7 @@ mvneta_ethtool_update_pcpu_stats(struct stats = per_cpu_ptr(pp->stats, cpu); do { @@ -2103,7 +7938,7 @@ index 0caa2df87c044..89ea3ef0ee162 100644 skb_alloc_error = stats->es.skb_alloc_error; refill_error = stats->es.refill_error; xdp_redirect = stats->es.ps.xdp_redirect; -@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct mvneta_port *pp, +@@ -4772,7 +4772,7 @@ mvneta_ethtool_update_pcpu_stats(struct xdp_xmit_err = stats->es.ps.xdp_xmit_err; xdp_tx = stats->es.ps.xdp_tx; xdp_tx_err = stats->es.ps.xdp_tx_err; @@ -2112,11 +7947,10 @@ index 0caa2df87c044..89ea3ef0ee162 100644 es->skb_alloc_error += skb_alloc_error; es->refill_error += refill_error; -diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -index eaa51cd7456b6..9dd8e0315dd4f 100644 ---- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c -@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +diff -rupN linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +--- linux.orig/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2008,7 +2008,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2125,7 +7959,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_redirect = cpu_stats->xdp_redirect; xdp_pass = cpu_stats->xdp_pass; xdp_drop = cpu_stats->xdp_drop; -@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *port, struct mvpp2_pcpu_stats *xdp_stats) +@@ -2016,7 +2016,7 @@ mvpp2_get_xdp_stats(struct mvpp2_port *p xdp_xmit_err = cpu_stats->xdp_xmit_err; xdp_tx = cpu_stats->xdp_tx; xdp_tx_err = cpu_stats->xdp_tx_err; @@ -2134,7 +7968,7 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 xdp_stats->xdp_redirect += xdp_redirect; xdp_stats->xdp_pass += xdp_pass; -@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +@@ -5115,12 +5115,12 @@ mvpp2_get_stats64(struct net_device *dev cpu_stats = per_cpu_ptr(port->stats, cpu); do { @@ -2149,11 +7983,10 @@ index eaa51cd7456b6..9dd8e0315dd4f 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c -index bbea5458000bf..c9bb92187719c 100644 ---- a/drivers/net/ethernet/marvell/sky2.c -+++ 
b/drivers/net/ethernet/marvell/sky2.c -@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/marvell/sky2.c linux/drivers/net/ethernet/marvell/sky2.c +--- linux.orig/drivers/net/ethernet/marvell/sky2.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/marvell/sky2.c 2022-12-04 10:40:26.692034106 -0500 +@@ -3894,19 +3894,19 @@ static void sky2_get_stats(struct net_de u64 _bytes, _packets; do { @@ -2177,11 +8010,10 @@ index bbea5458000bf..c9bb92187719c 100644 stats->tx_packets = _packets; stats->tx_bytes = _bytes; -diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -index b344632beaddf..988927f8c5d7d 100644 ---- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c -+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c -@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c 2022-12-04 10:40:26.692034106 -0500 +@@ -853,7 +853,7 @@ static void mtk_get_stats64(struct net_d } do { @@ -2190,7 +8022,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->rx_packets = hw_stats->rx_packets; storage->tx_packets = hw_stats->tx_packets; storage->rx_bytes = hw_stats->rx_bytes; -@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_device *dev, +@@ -865,7 +865,7 @@ static void mtk_get_stats64(struct net_d storage->rx_crc_errors = hw_stats->rx_fcs_errors; storage->rx_errors = hw_stats->rx_checksum_errors; storage->tx_aborted_errors = hw_stats->tx_skip; @@ -2199,7 +8031,7 @@ index b344632beaddf..988927f8c5d7d 100644 storage->tx_errors = dev->stats.tx_errors; storage->rx_dropped = dev->stats.rx_dropped; -@@ -3664,13 +3664,13 @@ static void mtk_get_ethtool_stats(struct net_device *dev, +@@ -3668,13 +3668,13 @@ static void mtk_get_ethtool_stats(struct do { data_dst = data; @@ -2215,11 +8047,4339 @@ index b344632beaddf..988927f8c5d7d 100644 } static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, -diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -index 30c7b0e157218..fa2753318cdf7 100644 ---- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c -@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig +--- linux.orig/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/mediatek/mtk_eth_soc.c.orig 2022-12-04 10:40:18.136056029 -0500 +@@ -0,0 +1,4325 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * ++ * Copyright (C) 2009-2016 John Crispin ++ * Copyright (C) 2009-2016 Felix Fietkau ++ * Copyright (C) 2013-2016 Michael Lee ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mtk_eth_soc.h" ++#include "mtk_wed.h" ++ ++static int mtk_msg_level = -1; ++module_param_named(msg_level, mtk_msg_level, int, 0); ++MODULE_PARM_DESC(msg_level, "Message level (-1=defaults,0=none,...,16=all)"); ++ ++#define 
MTK_ETHTOOL_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, x) / sizeof(u64) } ++ ++#define MTK_ETHTOOL_XDP_STAT(x) { #x, \ ++ offsetof(struct mtk_hw_stats, xdp_stats.x) / \ ++ sizeof(u64) } ++ ++static const struct mtk_reg_map mtk_reg_map = { ++ .tx_irq_mask = 0x1a1c, ++ .tx_irq_status = 0x1a18, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x1800, ++ .rx_ptr = 0x1900, ++ .rx_cnt_cfg = 0x1904, ++ .qcrx_ptr = 0x1908, ++ .glo_cfg = 0x1a04, ++ .rst_idx = 0x1a08, ++ .delay_irq = 0x1a0c, ++ .fc_th = 0x1a10, ++ .int_grp = 0x1a20, ++ .hred = 0x1a44, ++ .ctx_ptr = 0x1b00, ++ .dtx_ptr = 0x1b04, ++ .crx_ptr = 0x1b10, ++ .drx_ptr = 0x1b14, ++ .fq_head = 0x1b20, ++ .fq_tail = 0x1b24, ++ .fq_count = 0x1b28, ++ .fq_blen = 0x1b2c, ++ }, ++ .gdm1_cnt = 0x2400, ++}; ++ ++static const struct mtk_reg_map mt7628_reg_map = { ++ .tx_irq_mask = 0x0a28, ++ .tx_irq_status = 0x0a20, ++ .pdma = { ++ .rx_ptr = 0x0900, ++ .rx_cnt_cfg = 0x0904, ++ .pcrx_ptr = 0x0908, ++ .glo_cfg = 0x0a04, ++ .rst_idx = 0x0a08, ++ .delay_irq = 0x0a0c, ++ .irq_status = 0x0a20, ++ .irq_mask = 0x0a28, ++ .int_grp = 0x0a50, ++ }, ++}; ++ ++static const struct mtk_reg_map mt7986_reg_map = { ++ .tx_irq_mask = 0x461c, ++ .tx_irq_status = 0x4618, ++ .pdma = { ++ .rx_ptr = 0x6100, ++ .rx_cnt_cfg = 0x6104, ++ .pcrx_ptr = 0x6108, ++ .glo_cfg = 0x6204, ++ .rst_idx = 0x6208, ++ .delay_irq = 0x620c, ++ .irq_status = 0x6220, ++ .irq_mask = 0x6228, ++ .int_grp = 0x6250, ++ }, ++ .qdma = { ++ .qtx_cfg = 0x4400, ++ .rx_ptr = 0x4500, ++ .rx_cnt_cfg = 0x4504, ++ .qcrx_ptr = 0x4508, ++ .glo_cfg = 0x4604, ++ .rst_idx = 0x4608, ++ .delay_irq = 0x460c, ++ .fc_th = 0x4610, ++ .int_grp = 0x4620, ++ .hred = 0x4644, ++ .ctx_ptr = 0x4700, ++ .dtx_ptr = 0x4704, ++ .crx_ptr = 0x4710, ++ .drx_ptr = 0x4714, ++ .fq_head = 0x4720, ++ .fq_tail = 0x4724, ++ .fq_count = 0x4728, ++ .fq_blen = 0x472c, ++ }, ++ .gdm1_cnt = 0x1c00, ++}; ++ ++/* strings used by ethtool */ ++static const struct mtk_ethtool_stats { ++ char str[ETH_GSTRING_LEN]; ++ u32 offset; ++} mtk_ethtool_stats[] = { ++ MTK_ETHTOOL_STAT(tx_bytes), ++ MTK_ETHTOOL_STAT(tx_packets), ++ MTK_ETHTOOL_STAT(tx_skip), ++ MTK_ETHTOOL_STAT(tx_collisions), ++ MTK_ETHTOOL_STAT(rx_bytes), ++ MTK_ETHTOOL_STAT(rx_packets), ++ MTK_ETHTOOL_STAT(rx_overflow), ++ MTK_ETHTOOL_STAT(rx_fcs_errors), ++ MTK_ETHTOOL_STAT(rx_short_errors), ++ MTK_ETHTOOL_STAT(rx_long_errors), ++ MTK_ETHTOOL_STAT(rx_checksum_errors), ++ MTK_ETHTOOL_STAT(rx_flow_control_packets), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_redirect), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_pass), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_drop), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx), ++ MTK_ETHTOOL_XDP_STAT(rx_xdp_tx_errors), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit), ++ MTK_ETHTOOL_XDP_STAT(tx_xdp_xmit_errors), ++}; ++ ++static const char * const mtk_clks_source_name[] = { ++ "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "trgpll", ++ "sgmii_tx250m", "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", ++ "sgmii2_tx250m", "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", ++ "sgmii_ck", "eth2pll", "wocpu0", "wocpu1", "netsys0", "netsys1" ++}; ++ ++void mtk_w32(struct mtk_eth *eth, u32 val, unsigned reg) ++{ ++ __raw_writel(val, eth->base + reg); ++} ++ ++u32 mtk_r32(struct mtk_eth *eth, unsigned reg) ++{ ++ return __raw_readl(eth->base + reg); ++} ++ ++static u32 mtk_m32(struct mtk_eth *eth, u32 mask, 
u32 set, unsigned reg) ++{ ++ u32 val; ++ ++ val = mtk_r32(eth, reg); ++ val &= ~mask; ++ val |= set; ++ mtk_w32(eth, val, reg); ++ return reg; ++} ++ ++static int mtk_mdio_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned long t_start = jiffies; ++ ++ while (1) { ++ if (!(mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_ACCESS)) ++ return 0; ++ if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT)) ++ break; ++ cond_resched(); ++ } ++ ++ dev_err(eth->dev, "mdio: MDIO timeout\n"); ++ return -ETIMEDOUT; ++} ++ ++static int _mtk_mdio_write(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg, ++ u32 write_data) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_WRITE | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(write_data), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static int _mtk_mdio_read(struct mtk_eth *eth, u32 phy_addr, u32 phy_reg) ++{ ++ int ret; ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ if (phy_reg & MII_ADDR_C45) { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_ADDR | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr) | ++ PHY_IAC_DATA(mdiobus_c45_regad(phy_reg)), ++ MTK_PHY_IAC); ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C45 | ++ PHY_IAC_CMD_C45_READ | ++ PHY_IAC_REG(mdiobus_c45_devad(phy_reg)) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } else { ++ mtk_w32(eth, PHY_IAC_ACCESS | ++ PHY_IAC_START_C22 | ++ PHY_IAC_CMD_C22_READ | ++ PHY_IAC_REG(phy_reg) | ++ PHY_IAC_ADDR(phy_addr), ++ MTK_PHY_IAC); ++ } ++ ++ ret = mtk_mdio_busy_wait(eth); ++ if (ret < 0) ++ return ret; ++ ++ return mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_DATA_MASK; ++} ++ ++static int mtk_mdio_write(struct mii_bus *bus, int phy_addr, ++ int phy_reg, u16 val) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_write(eth, phy_addr, phy_reg, val); ++} ++ ++static int mtk_mdio_read(struct mii_bus *bus, int phy_addr, int phy_reg) ++{ ++ struct mtk_eth *eth = bus->priv; ++ ++ return _mtk_mdio_read(eth, phy_addr, phy_reg); ++} ++ ++static int mt7621_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface) ++{ ++ u32 val; ++ ++ /* Check DDR memory type. ++ * Currently TRGMII mode with DDR2 memory is not supported. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG, &val); ++ if (interface == PHY_INTERFACE_MODE_TRGMII && ++ val & SYSCFG_DRAM_TYPE_DDR2) { ++ dev_err(eth->dev, ++ "TRGMII mode with DDR2 memory is not supported!\n"); ++ return -EOPNOTSUPP; ++ } ++ ++ val = (interface == PHY_INTERFACE_MODE_TRGMII) ? 
++ ETHSYS_TRGMII_MT7621_DDR_PLL : 0; ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_MT7621_MASK, val); ++ ++ return 0; ++} ++ ++static void mtk_gmac0_rgmii_adjust(struct mtk_eth *eth, ++ phy_interface_t interface, int speed) ++{ ++ u32 val; ++ int ret; ++ ++ if (interface == PHY_INTERFACE_MODE_TRGMII) { ++ mtk_w32(eth, TRGMII_MODE, INTF_MODE); ++ val = 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ return; ++ } ++ ++ val = (speed == SPEED_1000) ? ++ INTF_MODE_RGMII_1000 : INTF_MODE_RGMII_10_100; ++ mtk_w32(eth, val, INTF_MODE); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_CLKCFG0, ++ ETHSYS_TRGMII_CLK_SEL362_5, ++ ETHSYS_TRGMII_CLK_SEL362_5); ++ ++ val = (speed == SPEED_1000) ? 250000000 : 500000000; ++ ret = clk_set_rate(eth->clks[MTK_CLK_TRGPLL], val); ++ if (ret) ++ dev_err(eth->dev, "Failed to set trgmii pll: %d\n", ret); ++ ++ val = (speed == SPEED_1000) ? ++ RCK_CTRL_RGMII_1000 : RCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_RCK_CTRL); ++ ++ val = (speed == SPEED_1000) ? ++ TCK_CTRL_RGMII_1000 : TCK_CTRL_RGMII_10_100; ++ mtk_w32(eth, val, TRGMII_TCK_CTRL); ++} ++ ++static struct phylink_pcs *mtk_mac_select_pcs(struct phylink_config *config, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ unsigned int sid; ++ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) { ++ sid = (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_SGMII)) ? ++ 0 : mac->id; ++ ++ return mtk_sgmii_select_pcs(eth->sgmii, sid); ++ } ++ ++ return NULL; ++} ++ ++static void mtk_mac_config(struct phylink_config *config, unsigned int mode, ++ const struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ int val, ge_mode, err = 0; ++ u32 i; ++ ++ /* MT76x8 has no hardware settings between for the MAC */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ mac->interface != state->interface) { ++ /* Setup soc pin functions */ ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_TRGMII: ++ if (mac->id) ++ goto err_phy; ++ if (!MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_GMAC1_TRGMII)) ++ goto err_phy; ++ fallthrough; ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_REVMII: ++ case PHY_INTERFACE_MODE_RMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RGMII)) { ++ err = mtk_gmac_rgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_1000BASEX: ++ case PHY_INTERFACE_MODE_2500BASEX: ++ case PHY_INTERFACE_MODE_SGMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ err = mtk_gmac_sgmii_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ case PHY_INTERFACE_MODE_GMII: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_GEPHY)) { ++ err = mtk_gmac_gephy_path_setup(eth, mac->id); ++ if (err) ++ goto init_err; ++ } ++ break; ++ default: ++ goto err_phy; ++ } ++ ++ /* Setup clock for 1st gmac */ ++ if (!mac->id && state->interface != PHY_INTERFACE_MODE_SGMII && ++ !phy_interface_mode_is_8023z(state->interface) && ++ MTK_HAS_CAPS(mac->hw->soc->caps, MTK_GMAC1_TRGMII)) { ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, ++ MTK_TRGMII_MT7621_CLK)) { ++ if 
(mt7621_gmac0_rgmii_adjust(mac->hw, ++ state->interface)) ++ goto err_phy; ++ } else { ++ /* FIXME: this is incorrect. Not only does it ++ * use state->speed (which is not guaranteed ++ * to be correct) but it also makes use of it ++ * in a code path that will only be reachable ++ * when the PHY interface mode changes, not ++ * when the speed changes. Consequently, RGMII ++ * is probably broken. ++ */ ++ mtk_gmac0_rgmii_adjust(mac->hw, ++ state->interface, ++ state->speed); ++ ++ /* mt7623_pad_clk_setup */ ++ for (i = 0 ; i < NUM_TRGMII_CTRL; i++) ++ mtk_w32(mac->hw, ++ TD_DM_DRVP(8) | TD_DM_DRVN(8), ++ TRGMII_TD_ODT(i)); ++ ++ /* Assert/release MT7623 RXC reset */ ++ mtk_m32(mac->hw, 0, RXC_RST | RXC_DQSISEL, ++ TRGMII_RCK_CTRL); ++ mtk_m32(mac->hw, RXC_RST, 0, TRGMII_RCK_CTRL); ++ } ++ } ++ ++ ge_mode = 0; ++ switch (state->interface) { ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_GMII: ++ ge_mode = 1; ++ break; ++ case PHY_INTERFACE_MODE_REVMII: ++ ge_mode = 2; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ if (mac->id) ++ goto err_phy; ++ ge_mode = 3; ++ break; ++ default: ++ break; ++ } ++ ++ /* put the gmac into the right mode */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ val &= ~SYSCFG0_GE_MODE(SYSCFG0_GE_MASK, mac->id); ++ val |= SYSCFG0_GE_MODE(ge_mode, mac->id); ++ regmap_write(eth->ethsys, ETHSYS_SYSCFG0, val); ++ ++ mac->interface = state->interface; ++ } ++ ++ /* SGMII */ ++ if (state->interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(state->interface)) { ++ /* The path GMAC to SGMII will be enabled once the SGMIISYS is ++ * being setup done. ++ */ ++ regmap_read(eth->ethsys, ETHSYS_SYSCFG0, &val); ++ ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, ++ ~(u32)SYSCFG0_SGMII_MASK); ++ ++ /* Save the syscfg0 value for mac_finish */ ++ mac->syscfg0 = val; ++ } else if (phylink_autoneg_inband(mode)) { ++ dev_err(eth->dev, ++ "In-band mode not supported in non SGMII mode!\n"); ++ return; ++ } ++ ++ return; ++ ++err_phy: ++ dev_err(eth->dev, "%s: GMAC%d mode %s not supported!\n", __func__, ++ mac->id, phy_modes(state->interface)); ++ return; ++ ++init_err: ++ dev_err(eth->dev, "%s: GMAC%d mode %s err: %d!\n", __func__, ++ mac->id, phy_modes(state->interface), err); ++} ++ ++static int mtk_mac_finish(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ /* Enable SGMII */ ++ if (interface == PHY_INTERFACE_MODE_SGMII || ++ phy_interface_mode_is_8023z(interface)) ++ regmap_update_bits(eth->ethsys, ETHSYS_SYSCFG0, ++ SYSCFG0_SGMII_MASK, mac->syscfg0); ++ ++ /* Setup gmac */ ++ mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur; ++ mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE | ++ MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK; ++ ++ /* Only update control register when needed! 
*/ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ ++ return 0; ++} ++ ++static void mtk_mac_pcs_get_state(struct phylink_config *config, ++ struct phylink_link_state *state) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 pmsr = mtk_r32(mac->hw, MTK_MAC_MSR(mac->id)); ++ ++ state->link = (pmsr & MAC_MSR_LINK); ++ state->duplex = (pmsr & MAC_MSR_DPX) >> 1; ++ ++ switch (pmsr & (MAC_MSR_SPEED_1000 | MAC_MSR_SPEED_100)) { ++ case 0: ++ state->speed = SPEED_10; ++ break; ++ case MAC_MSR_SPEED_100: ++ state->speed = SPEED_100; ++ break; ++ case MAC_MSR_SPEED_1000: ++ state->speed = SPEED_1000; ++ break; ++ default: ++ state->speed = SPEED_UNKNOWN; ++ break; ++ } ++ ++ state->pause &= (MLO_PAUSE_RX | MLO_PAUSE_TX); ++ if (pmsr & MAC_MSR_RX_FC) ++ state->pause |= MLO_PAUSE_RX; ++ if (pmsr & MAC_MSR_TX_FC) ++ state->pause |= MLO_PAUSE_TX; ++} ++ ++static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, ++ phy_interface_t interface) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ ++ mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN); ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static void mtk_mac_link_up(struct phylink_config *config, ++ struct phy_device *phy, ++ unsigned int mode, phy_interface_t interface, ++ int speed, int duplex, bool tx_pause, bool rx_pause) ++{ ++ struct mtk_mac *mac = container_of(config, struct mtk_mac, ++ phylink_config); ++ u32 mcr; ++ ++ mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr &= ~(MAC_MCR_SPEED_100 | MAC_MCR_SPEED_1000 | ++ MAC_MCR_FORCE_DPX | MAC_MCR_FORCE_TX_FC | ++ MAC_MCR_FORCE_RX_FC); ++ ++ /* Configure speed */ ++ switch (speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ mcr |= MAC_MCR_SPEED_1000; ++ break; ++ case SPEED_100: ++ mcr |= MAC_MCR_SPEED_100; ++ break; ++ } ++ ++ /* Configure duplex */ ++ if (duplex == DUPLEX_FULL) ++ mcr |= MAC_MCR_FORCE_DPX; ++ ++ /* Configure pause modes - phylink will avoid these for half duplex */ ++ if (tx_pause) ++ mcr |= MAC_MCR_FORCE_TX_FC; ++ if (rx_pause) ++ mcr |= MAC_MCR_FORCE_RX_FC; ++ ++ mcr |= MAC_MCR_TX_EN | MAC_MCR_RX_EN; ++ mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); ++} ++ ++static const struct phylink_mac_ops mtk_phylink_ops = { ++ .validate = phylink_generic_validate, ++ .mac_select_pcs = mtk_mac_select_pcs, ++ .mac_pcs_get_state = mtk_mac_pcs_get_state, ++ .mac_config = mtk_mac_config, ++ .mac_finish = mtk_mac_finish, ++ .mac_link_down = mtk_mac_link_down, ++ .mac_link_up = mtk_mac_link_up, ++}; ++ ++static int mtk_mdio_init(struct mtk_eth *eth) ++{ ++ struct device_node *mii_np; ++ int ret; ++ ++ mii_np = of_get_child_by_name(eth->dev->of_node, "mdio-bus"); ++ if (!mii_np) { ++ dev_err(eth->dev, "no %s child node found", "mdio-bus"); ++ return -ENODEV; ++ } ++ ++ if (!of_device_is_available(mii_np)) { ++ ret = -ENODEV; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus = devm_mdiobus_alloc(eth->dev); ++ if (!eth->mii_bus) { ++ ret = -ENOMEM; ++ goto err_put_node; ++ } ++ ++ eth->mii_bus->name = "mdio"; ++ eth->mii_bus->read = mtk_mdio_read; ++ eth->mii_bus->write = mtk_mdio_write; ++ eth->mii_bus->probe_capabilities = MDIOBUS_C22_C45; ++ eth->mii_bus->priv = eth; ++ eth->mii_bus->parent = eth->dev; ++ ++ snprintf(eth->mii_bus->id, MII_BUS_ID_SIZE, "%pOFn", mii_np); ++ ret = of_mdiobus_register(eth->mii_bus, mii_np); ++ ++err_put_node: ++ of_node_put(mii_np); ++ return ret; ++} ++ ++static void 
mtk_mdio_cleanup(struct mtk_eth *eth) ++{ ++ if (!eth->mii_bus) ++ return; ++ ++ mdiobus_unregister(eth->mii_bus); ++} ++ ++static inline void mtk_tx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_tx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->tx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->tx_irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->tx_irq_mask); ++ spin_unlock_irqrestore(ð->tx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_disable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val & ~mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static inline void mtk_rx_irq_enable(struct mtk_eth *eth, u32 mask) ++{ ++ unsigned long flags; ++ u32 val; ++ ++ spin_lock_irqsave(ð->rx_irq_lock, flags); ++ val = mtk_r32(eth, eth->soc->reg_map->pdma.irq_mask); ++ mtk_w32(eth, val | mask, eth->soc->reg_map->pdma.irq_mask); ++ spin_unlock_irqrestore(ð->rx_irq_lock, flags); ++} ++ ++static int mtk_set_mac_address(struct net_device *dev, void *p) ++{ ++ int ret = eth_mac_addr(dev, p); ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ const char *macaddr = dev->dev_addr; ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ spin_lock_bh(&mac->hw->page_lock); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MT7628_SDM_MAC_ADRH); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MT7628_SDM_MAC_ADRL); ++ } else { ++ mtk_w32(mac->hw, (macaddr[0] << 8) | macaddr[1], ++ MTK_GDMA_MAC_ADRH(mac->id)); ++ mtk_w32(mac->hw, (macaddr[2] << 24) | (macaddr[3] << 16) | ++ (macaddr[4] << 8) | macaddr[5], ++ MTK_GDMA_MAC_ADRL(mac->id)); ++ } ++ spin_unlock_bh(&mac->hw->page_lock); ++ ++ return 0; ++} ++ ++void mtk_stats_update_mac(struct mtk_mac *mac) ++{ ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ hw_stats->tx_packets += mtk_r32(mac->hw, MT7628_SDM_TPCNT); ++ hw_stats->tx_bytes += mtk_r32(mac->hw, MT7628_SDM_TBCNT); ++ hw_stats->rx_packets += mtk_r32(mac->hw, MT7628_SDM_RPCNT); ++ hw_stats->rx_bytes += mtk_r32(mac->hw, MT7628_SDM_RBCNT); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, MT7628_SDM_CS_ERR); ++ } else { ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ unsigned int offs = hw_stats->reg_offset; ++ u64 stats; ++ ++ hw_stats->rx_bytes += mtk_r32(mac->hw, reg_map->gdm1_cnt + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x4 + offs); ++ if (stats) ++ hw_stats->rx_bytes += (stats << 32); ++ hw_stats->rx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x8 + offs); ++ hw_stats->rx_overflow += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x10 + offs); ++ hw_stats->rx_fcs_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x14 + offs); ++ hw_stats->rx_short_errors += ++ 
mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x18 + offs); ++ hw_stats->rx_long_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x1c + offs); ++ hw_stats->rx_checksum_errors += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x20 + offs); ++ hw_stats->rx_flow_control_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x24 + offs); ++ hw_stats->tx_skip += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x28 + offs); ++ hw_stats->tx_collisions += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x2c + offs); ++ hw_stats->tx_bytes += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x30 + offs); ++ stats = mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x34 + offs); ++ if (stats) ++ hw_stats->tx_bytes += (stats << 32); ++ hw_stats->tx_packets += ++ mtk_r32(mac->hw, reg_map->gdm1_cnt + 0x38 + offs); ++ } ++ ++ u64_stats_update_end(&hw_stats->syncp); ++} ++ ++static void mtk_stats_update(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->mac[i] || !eth->mac[i]->hw_stats) ++ continue; ++ if (spin_trylock(ð->mac[i]->hw_stats->stats_lock)) { ++ mtk_stats_update_mac(eth->mac[i]); ++ spin_unlock(ð->mac[i]->hw_stats->stats_lock); ++ } ++ } ++} ++ ++static void mtk_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ unsigned int start; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hw_stats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hw_stats->stats_lock); ++ } ++ } ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&hw_stats->syncp); ++ storage->rx_packets = hw_stats->rx_packets; ++ storage->tx_packets = hw_stats->tx_packets; ++ storage->rx_bytes = hw_stats->rx_bytes; ++ storage->tx_bytes = hw_stats->tx_bytes; ++ storage->collisions = hw_stats->tx_collisions; ++ storage->rx_length_errors = hw_stats->rx_short_errors + ++ hw_stats->rx_long_errors; ++ storage->rx_over_errors = hw_stats->rx_overflow; ++ storage->rx_crc_errors = hw_stats->rx_fcs_errors; ++ storage->rx_errors = hw_stats->rx_checksum_errors; ++ storage->tx_aborted_errors = hw_stats->tx_skip; ++ } while (u64_stats_fetch_retry_irq(&hw_stats->syncp, start)); ++ ++ storage->tx_errors = dev->stats.tx_errors; ++ storage->rx_dropped = dev->stats.rx_dropped; ++ storage->tx_dropped = dev->stats.tx_dropped; ++} ++ ++static inline int mtk_max_frag_size(int mtu) ++{ ++ /* make sure buf_size will be at least MTK_MAX_RX_LENGTH */ ++ if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH_2K) ++ mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return SKB_DATA_ALIGN(MTK_RX_HLEN + mtu) + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++} ++ ++static inline int mtk_max_buf_size(int frag_size) ++{ ++ int buf_size = frag_size - NET_SKB_PAD - NET_IP_ALIGN - ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ WARN_ON(buf_size < MTK_MAX_RX_LENGTH_2K); ++ ++ return buf_size; ++} ++ ++static bool mtk_rx_get_desc(struct mtk_eth *eth, struct mtk_rx_dma_v2 *rxd, ++ struct mtk_rx_dma_v2 *dma_rxd) ++{ ++ rxd->rxd2 = READ_ONCE(dma_rxd->rxd2); ++ if (!(rxd->rxd2 & RX_DMA_DONE)) ++ return false; ++ ++ rxd->rxd1 = READ_ONCE(dma_rxd->rxd1); ++ rxd->rxd3 = READ_ONCE(dma_rxd->rxd3); ++ rxd->rxd4 = READ_ONCE(dma_rxd->rxd4); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = READ_ONCE(dma_rxd->rxd5); ++ rxd->rxd6 = READ_ONCE(dma_rxd->rxd6); ++ } ++ ++ return true; ++} ++ ++static void *mtk_max_lro_buf_alloc(gfp_t gfp_mask) ++{ ++ unsigned int size = mtk_max_frag_size(MTK_MAX_LRO_RX_LENGTH); ++ 
unsigned long data; ++ ++ data = __get_free_pages(gfp_mask | __GFP_COMP | __GFP_NOWARN, ++ get_order(size)); ++ ++ return (void *)data; ++} ++ ++/* the qdma core needs scratch memory to be setup */ ++static int mtk_init_fq_dma(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ dma_addr_t phy_ring_tail; ++ int cnt = MTK_DMA_SIZE; ++ dma_addr_t dma_addr; ++ int i; ++ ++ eth->scratch_ring = dma_alloc_coherent(eth->dma_dev, ++ cnt * soc->txrx.txd_size, ++ ð->phy_scratch_ring, ++ GFP_KERNEL); ++ if (unlikely(!eth->scratch_ring)) ++ return -ENOMEM; ++ ++ eth->scratch_head = kcalloc(cnt, MTK_QDMA_PAGE_SIZE, GFP_KERNEL); ++ if (unlikely(!eth->scratch_head)) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) ++ return -ENOMEM; ++ ++ phy_ring_tail = eth->phy_scratch_ring + soc->txrx.txd_size * (cnt - 1); ++ ++ for (i = 0; i < cnt; i++) { ++ struct mtk_tx_dma_v2 *txd; ++ ++ txd = eth->scratch_ring + i * soc->txrx.txd_size; ++ txd->txd1 = dma_addr + i * MTK_QDMA_PAGE_SIZE; ++ if (i < cnt - 1) ++ txd->txd2 = eth->phy_scratch_ring + ++ (i + 1) * soc->txrx.txd_size; ++ ++ txd->txd3 = TX_DMA_PLEN0(MTK_QDMA_PAGE_SIZE); ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ mtk_w32(eth, eth->phy_scratch_ring, soc->reg_map->qdma.fq_head); ++ mtk_w32(eth, phy_ring_tail, soc->reg_map->qdma.fq_tail); ++ mtk_w32(eth, (cnt << 16) | cnt, soc->reg_map->qdma.fq_count); ++ mtk_w32(eth, MTK_QDMA_PAGE_SIZE << 16, soc->reg_map->qdma.fq_blen); ++ ++ return 0; ++} ++ ++static void *mtk_qdma_phys_to_virt(struct mtk_tx_ring *ring, u32 desc) ++{ ++ return ring->dma + (desc - ring->phys); ++} ++ ++static struct mtk_tx_buf *mtk_desc_to_tx_buf(struct mtk_tx_ring *ring, ++ void *txd, u32 txd_size) ++{ ++ int idx = (txd - ring->dma) / txd_size; ++ ++ return &ring->buf[idx]; ++} ++ ++static struct mtk_tx_dma *qdma_to_pdma(struct mtk_tx_ring *ring, ++ struct mtk_tx_dma *dma) ++{ ++ return ring->dma_pdma - (struct mtk_tx_dma *)ring->dma + dma; ++} ++ ++static int txd_to_idx(struct mtk_tx_ring *ring, void *dma, u32 txd_size) ++{ ++ return (dma - ring->dma) / txd_size; ++} ++ ++static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct xdp_frame_bulk *bq, bool napi) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) { ++ dma_unmap_single(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ } else { ++ if (dma_unmap_len(tx_buf, dma_len0)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr0), ++ dma_unmap_len(tx_buf, dma_len0), ++ DMA_TO_DEVICE); ++ } ++ ++ if (dma_unmap_len(tx_buf, dma_len1)) { ++ dma_unmap_page(eth->dma_dev, ++ dma_unmap_addr(tx_buf, dma_addr1), ++ dma_unmap_len(tx_buf, dma_len1), ++ DMA_TO_DEVICE); ++ } ++ } ++ ++ if (tx_buf->data && tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ if (napi) ++ napi_consume_skb(skb, napi); ++ else ++ dev_kfree_skb_any(skb); ++ } else { ++ struct xdp_frame *xdpf = tx_buf->data; ++ ++ if (napi && tx_buf->type == 
MTK_TYPE_XDP_TX) ++ xdp_return_frame_rx_napi(xdpf); ++ else if (bq) ++ xdp_return_frame_bulk(xdpf, bq); ++ else ++ xdp_return_frame(xdpf); ++ } ++ } ++ tx_buf->flags = 0; ++ tx_buf->data = NULL; ++} ++ ++static void setup_tx_buf(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf, ++ struct mtk_tx_dma *txd, dma_addr_t mapped_addr, ++ size_t size, int idx) ++{ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } else { ++ if (idx & 1) { ++ txd->txd3 = mapped_addr; ++ txd->txd2 |= TX_DMA_PLEN1(size); ++ dma_unmap_addr_set(tx_buf, dma_addr1, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len1, size); ++ } else { ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ txd->txd1 = mapped_addr; ++ txd->txd2 = TX_DMA_PLEN0(size); ++ dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr); ++ dma_unmap_len_set(tx_buf, dma_len0, size); ++ } ++ } ++} ++ ++static void mtk_tx_set_dma_desc_v1(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_dma *desc = txd; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_SWC | TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT; /* forward port */ ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM; ++ /* vlan header offload */ ++ if (info->vlan) ++ data |= TX_DMA_INS_VLAN | info->vlan_tci; ++ } ++ WRITE_ONCE(desc->txd4, data); ++} ++ ++static void mtk_tx_set_dma_desc_v2(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma_v2 *desc = txd; ++ struct mtk_eth *eth = mac->hw; ++ u32 data; ++ ++ WRITE_ONCE(desc->txd1, info->addr); ++ ++ data = TX_DMA_PLEN0(info->size); ++ if (info->last) ++ data |= TX_DMA_LS0; ++ WRITE_ONCE(desc->txd3, data); ++ ++ if (!info->qid && mac->id) ++ info->qid = MTK_QDMA_GMAC2_QID; ++ ++ data = (mac->id + 1) << TX_DMA_FPORT_SHIFT_V2; /* forward port */ ++ data |= TX_DMA_SWC_V2 | QID_BITS_V2(info->qid); ++ WRITE_ONCE(desc->txd4, data); ++ ++ data = 0; ++ if (info->first) { ++ if (info->gso) ++ data |= TX_DMA_TSO_V2; ++ /* tx checksum offload */ ++ if (info->csum) ++ data |= TX_DMA_CHKSUM_V2; ++ } ++ WRITE_ONCE(desc->txd5, data); ++ ++ data = 0; ++ if (info->first && info->vlan) ++ data |= TX_DMA_INS_VLAN_V2 | info->vlan_tci; ++ WRITE_ONCE(desc->txd6, data); ++ ++ WRITE_ONCE(desc->txd7, 0); ++ WRITE_ONCE(desc->txd8, 0); ++} ++ ++static void mtk_tx_set_dma_desc(struct net_device *dev, void *txd, ++ struct mtk_tx_dma_desc_info *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mtk_tx_set_dma_desc_v2(dev, txd, info); ++ else ++ mtk_tx_set_dma_desc_v1(dev, txd, info); ++} ++ ++static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev, ++ int tx_num, struct mtk_tx_ring *ring, bool gso) ++{ ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = skb_headlen(skb), ++ .gso = gso, ++ .csum = skb->ip_summed == CHECKSUM_PARTIAL, ++ .vlan = skb_vlan_tag_present(skb), ++ .qid = skb->mark & MTK_QDMA_TX_MASK, ++ .vlan_tci = skb_vlan_tag_get(skb), ++ .first = true, ++ .last = !skb_is_nonlinear(skb), ++ }; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth 
= mac->hw; ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_dma *itxd, *txd; ++ struct mtk_tx_dma *itxd_pdma, *txd_pdma; ++ struct mtk_tx_buf *itx_buf, *tx_buf; ++ int i, n_desc = 1; ++ int k = 0; ++ ++ itxd = ring->next_free; ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ if (itxd == ring->last_free) ++ return -ENOMEM; ++ ++ itx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ memset(itx_buf, 0, sizeof(*itx_buf)); ++ ++ txd_info.addr = dma_map_single(eth->dma_dev, skb->data, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ return -ENOMEM; ++ ++ mtk_tx_set_dma_desc(dev, itxd, &txd_info); ++ ++ itx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ setup_tx_buf(eth, itx_buf, itxd_pdma, txd_info.addr, txd_info.size, ++ k++); ++ ++ /* TX SG offload */ ++ txd = itxd; ++ txd_pdma = qdma_to_pdma(ring, txd); ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ unsigned int offset = 0; ++ int frag_size = skb_frag_size(frag); ++ ++ while (frag_size) { ++ bool new_desc = true; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || ++ (i & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto err_dma; ++ ++ n_desc++; ++ } else { ++ new_desc = false; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = min_t(unsigned int, frag_size, ++ soc->txrx.dma_max_len); ++ txd_info.qid = skb->mark & MTK_QDMA_TX_MASK; ++ txd_info.last = i == skb_shinfo(skb)->nr_frags - 1 && ++ !(frag_size - txd_info.size); ++ txd_info.addr = skb_frag_dma_map(eth->dma_dev, frag, ++ offset, txd_info.size, ++ DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info.addr))) ++ goto err_dma; ++ ++ mtk_tx_set_dma_desc(dev, txd, &txd_info); ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ if (new_desc) ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ tx_buf->flags |= MTK_TX_FLAGS_PAGE0; ++ tx_buf->flags |= (!mac->id) ? 
MTK_TX_FLAGS_FPORT0 : ++ MTK_TX_FLAGS_FPORT1; ++ ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info.addr, ++ txd_info.size, k++); ++ ++ frag_size -= txd_info.size; ++ offset += txd_info.size; ++ } ++ } ++ ++ /* store skb to cleanup */ ++ itx_buf->type = MTK_TYPE_SKB; ++ itx_buf->data = skb; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (k & 0x1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ netdev_sent_queue(dev, skb->len); ++ skb_tx_timestamp(skb); ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)) || ++ !netdev_xmit_more()) ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int next_idx; ++ ++ next_idx = NEXT_DESP_IDX(txd_to_idx(ring, txd, soc->txrx.txd_size), ++ ring->dma_size); ++ mtk_w32(eth, next_idx, MT7628_TX_CTX_IDX0); ++ } ++ ++ return 0; ++ ++err_dma: ++ do { ++ tx_buf = mtk_desc_to_tx_buf(ring, itxd, soc->txrx.txd_size); ++ ++ /* unmap dma */ ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ itxd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2); ++ itxd_pdma = qdma_to_pdma(ring, itxd); ++ } while (itxd != txd); ++ ++ return -ENOMEM; ++} ++ ++static int mtk_cal_txd_req(struct mtk_eth *eth, struct sk_buff *skb) ++{ ++ int i, nfrags = 1; ++ skb_frag_t *frag; ++ ++ if (skb_is_gso(skb)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ frag = &skb_shinfo(skb)->frags[i]; ++ nfrags += DIV_ROUND_UP(skb_frag_size(frag), ++ eth->soc->txrx.dma_max_len); ++ } ++ } else { ++ nfrags += skb_shinfo(skb)->nr_frags; ++ } ++ ++ return nfrags; ++} ++ ++static int mtk_queue_stopped(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ if (netif_queue_stopped(eth->netdev[i])) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static void mtk_wake_queue(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ netif_wake_queue(eth->netdev[i]); ++ } ++} ++ ++static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct net_device_stats *stats = &dev->stats; ++ bool gso = false; ++ int tx_num; ++ ++ /* normally we can rely on the stack not calling this more than once, ++ * however we have 2 queues running on the same ring so we need to lock ++ * the ring access ++ */ ++ spin_lock(ð->page_lock); ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto drop; ++ ++ tx_num = mtk_cal_txd_req(eth, skb); ++ if (unlikely(atomic_read(&ring->free_count) <= tx_num)) { ++ netif_stop_queue(dev); ++ netif_err(eth, tx_queued, dev, ++ "Tx Ring full when queue awake!\n"); ++ spin_unlock(ð->page_lock); ++ return NETDEV_TX_BUSY; ++ } ++ ++ /* TSO: fill MSS info in tcp checksum field */ ++ if (skb_is_gso(skb)) { ++ if (skb_cow_head(skb, 0)) { ++ netif_warn(eth, tx_err, dev, ++ "GSO expand head fail.\n"); ++ goto drop; ++ } ++ ++ if (skb_shinfo(skb)->gso_type & ++ (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { ++ gso = true; ++ tcp_hdr(skb)->check = htons(skb_shinfo(skb)->gso_size); ++ } ++ } ++ ++ if (mtk_tx_map(skb, 
dev, tx_num, ring, gso) < 0) ++ goto drop; ++ ++ if (unlikely(atomic_read(&ring->free_count) <= ring->thresh)) ++ netif_stop_queue(dev); ++ ++ spin_unlock(ð->page_lock); ++ ++ return NETDEV_TX_OK; ++ ++drop: ++ spin_unlock(ð->page_lock); ++ stats->tx_dropped++; ++ dev_kfree_skb_any(skb); ++ return NETDEV_TX_OK; ++} ++ ++static struct mtk_rx_ring *mtk_get_rx_ring(struct mtk_eth *eth) ++{ ++ int i; ++ struct mtk_rx_ring *ring; ++ int idx; ++ ++ if (!eth->hwlro) ++ return ð->rx_ring[0]; ++ ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ ring = ð->rx_ring[i]; ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ if (rxd->rxd2 & RX_DMA_DONE) { ++ ring->calc_idx_update = true; ++ return ring; ++ } ++ } ++ ++ return NULL; ++} ++ ++static void mtk_update_rx_cpu_idx(struct mtk_eth *eth) ++{ ++ struct mtk_rx_ring *ring; ++ int i; ++ ++ if (!eth->hwlro) { ++ ring = ð->rx_ring[0]; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } else { ++ for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) { ++ ring = ð->rx_ring[i]; ++ if (ring->calc_idx_update) { ++ ring->calc_idx_update = false; ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ } ++ } ++ } ++} ++ ++static bool mtk_page_pool_enabled(struct mtk_eth *eth) ++{ ++ return MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2); ++} ++ ++static struct page_pool *mtk_create_page_pool(struct mtk_eth *eth, ++ struct xdp_rxq_info *xdp_q, ++ int id, int size) ++{ ++ struct page_pool_params pp_params = { ++ .order = 0, ++ .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, ++ .pool_size = size, ++ .nid = NUMA_NO_NODE, ++ .dev = eth->dma_dev, ++ .offset = MTK_PP_HEADROOM, ++ .max_len = MTK_PP_MAX_BUF_SIZE, ++ }; ++ struct page_pool *pp; ++ int err; ++ ++ pp_params.dma_dir = rcu_access_pointer(eth->prog) ? 
DMA_BIDIRECTIONAL ++ : DMA_FROM_DEVICE; ++ pp = page_pool_create(&pp_params); ++ if (IS_ERR(pp)) ++ return pp; ++ ++ err = __xdp_rxq_info_reg(xdp_q, ð->dummy_dev, eth->rx_napi.napi_id, ++ id, PAGE_SIZE); ++ if (err < 0) ++ goto err_free_pp; ++ ++ err = xdp_rxq_info_reg_mem_model(xdp_q, MEM_TYPE_PAGE_POOL, pp); ++ if (err) ++ goto err_unregister_rxq; ++ ++ return pp; ++ ++err_unregister_rxq: ++ xdp_rxq_info_unreg(xdp_q); ++err_free_pp: ++ page_pool_destroy(pp); ++ ++ return ERR_PTR(err); ++} ++ ++static void *mtk_page_pool_get_buff(struct page_pool *pp, dma_addr_t *dma_addr, ++ gfp_t gfp_mask) ++{ ++ struct page *page; ++ ++ page = page_pool_alloc_pages(pp, gfp_mask | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ *dma_addr = page_pool_get_dma_addr(page) + MTK_PP_HEADROOM; ++ return page_address(page); ++} ++ ++static void mtk_rx_put_buff(struct mtk_rx_ring *ring, void *data, bool napi) ++{ ++ if (ring->page_pool) ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(data), napi); ++ else ++ skb_free_frag(data); ++} ++ ++static int mtk_xdp_frame_map(struct mtk_eth *eth, struct net_device *dev, ++ struct mtk_tx_dma_desc_info *txd_info, ++ struct mtk_tx_dma *txd, struct mtk_tx_buf *tx_buf, ++ void *data, u16 headroom, int index, bool dma_map) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_tx_dma *txd_pdma; ++ ++ if (dma_map) { /* ndo_xdp_xmit */ ++ txd_info->addr = dma_map_single(eth->dma_dev, data, ++ txd_info->size, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, txd_info->addr))) ++ return -ENOMEM; ++ ++ tx_buf->flags |= MTK_TX_FLAGS_SINGLE0; ++ } else { ++ struct page *page = virt_to_head_page(data); ++ ++ txd_info->addr = page_pool_get_dma_addr(page) + ++ sizeof(struct xdp_frame) + headroom; ++ dma_sync_single_for_device(eth->dma_dev, txd_info->addr, ++ txd_info->size, DMA_BIDIRECTIONAL); ++ } ++ mtk_tx_set_dma_desc(dev, txd, txd_info); ++ ++ tx_buf->flags |= !mac->id ? MTK_TX_FLAGS_FPORT0 : MTK_TX_FLAGS_FPORT1; ++ tx_buf->type = dma_map ? MTK_TYPE_XDP_NDO : MTK_TYPE_XDP_TX; ++ tx_buf->data = (void *)MTK_DMA_DUMMY_DESC; ++ ++ txd_pdma = qdma_to_pdma(ring, txd); ++ setup_tx_buf(eth, tx_buf, txd_pdma, txd_info->addr, txd_info->size, ++ index); ++ ++ return 0; ++} ++ ++static int mtk_xdp_submit_frame(struct mtk_eth *eth, struct xdp_frame *xdpf, ++ struct net_device *dev, bool dma_map) ++{ ++ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_dma_desc_info txd_info = { ++ .size = xdpf->len, ++ .first = true, ++ .last = !xdp_frame_has_frags(xdpf), ++ }; ++ int err, index = 0, n_desc = 1, nr_frags; ++ struct mtk_tx_dma *htxd, *txd, *txd_pdma; ++ struct mtk_tx_buf *htx_buf, *tx_buf; ++ void *data = xdpf->data; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ return -EBUSY; ++ ++ nr_frags = unlikely(xdp_frame_has_frags(xdpf)) ? 
sinfo->nr_frags : 0; ++ if (unlikely(atomic_read(&ring->free_count) <= 1 + nr_frags)) ++ return -EBUSY; ++ ++ spin_lock(ð->page_lock); ++ ++ txd = ring->next_free; ++ if (txd == ring->last_free) { ++ spin_unlock(ð->page_lock); ++ return -ENOMEM; ++ } ++ htxd = txd; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ htx_buf = tx_buf; ++ ++ for (;;) { ++ err = mtk_xdp_frame_map(eth, dev, &txd_info, txd, tx_buf, ++ data, xdpf->headroom, index, dma_map); ++ if (err < 0) ++ goto unmap; ++ ++ if (txd_info.last) ++ break; ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA) || (index & 0x1)) { ++ txd = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (txd == ring->last_free) ++ goto unmap; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, txd, ++ soc->txrx.txd_size); ++ memset(tx_buf, 0, sizeof(*tx_buf)); ++ n_desc++; ++ } ++ ++ memset(&txd_info, 0, sizeof(struct mtk_tx_dma_desc_info)); ++ txd_info.size = skb_frag_size(&sinfo->frags[index]); ++ txd_info.last = index + 1 == nr_frags; ++ data = skb_frag_address(&sinfo->frags[index]); ++ ++ index++; ++ } ++ /* store xdpf for cleanup */ ++ htx_buf->data = xdpf; ++ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ txd_pdma = qdma_to_pdma(ring, txd); ++ if (index & 1) ++ txd_pdma->txd2 |= TX_DMA_LS0; ++ else ++ txd_pdma->txd2 |= TX_DMA_LS1; ++ } ++ ++ ring->next_free = mtk_qdma_phys_to_virt(ring, txd->txd2); ++ atomic_sub(n_desc, &ring->free_count); ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, txd->txd2, soc->reg_map->qdma.ctx_ptr); ++ } else { ++ int idx; ++ ++ idx = txd_to_idx(ring, txd, soc->txrx.txd_size); ++ mtk_w32(eth, NEXT_DESP_IDX(idx, ring->dma_size), ++ MT7628_TX_CTX_IDX0); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return 0; ++ ++unmap: ++ while (htxd != txd) { ++ txd_pdma = qdma_to_pdma(ring, htxd); ++ tx_buf = mtk_desc_to_tx_buf(ring, htxd, soc->txrx.txd_size); ++ mtk_tx_unmap(eth, tx_buf, NULL, false); ++ ++ htxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) ++ txd_pdma->txd2 = TX_DMA_DESP2_DEF; ++ ++ htxd = mtk_qdma_phys_to_virt(ring, htxd->txd2); ++ } ++ ++ spin_unlock(ð->page_lock); ++ ++ return err; ++} ++ ++static int mtk_xdp_xmit(struct net_device *dev, int num_frame, ++ struct xdp_frame **frames, u32 flags) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ struct mtk_eth *eth = mac->hw; ++ int i, nxmit = 0; ++ ++ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) ++ return -EINVAL; ++ ++ for (i = 0; i < num_frame; i++) { ++ if (mtk_xdp_submit_frame(eth, frames[i], dev, true)) ++ break; ++ nxmit++; ++ } ++ ++ u64_stats_update_begin(&hw_stats->syncp); ++ hw_stats->xdp_stats.tx_xdp_xmit += nxmit; ++ hw_stats->xdp_stats.tx_xdp_xmit_errors += num_frame - nxmit; ++ u64_stats_update_end(&hw_stats->syncp); ++ ++ return nxmit; ++} ++ ++static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring, ++ struct xdp_buff *xdp, struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hw_stats = mac->hw_stats; ++ u64 *count = &hw_stats->xdp_stats.rx_xdp_drop; ++ struct bpf_prog *prog; ++ u32 act = XDP_PASS; ++ ++ rcu_read_lock(); ++ ++ prog = rcu_dereference(eth->prog); ++ if (!prog) ++ goto out; ++ ++ act = bpf_prog_run_xdp(prog, xdp); ++ switch (act) { ++ case XDP_PASS: ++ count = &hw_stats->xdp_stats.rx_xdp_pass; ++ goto 
update_stats; ++ case XDP_REDIRECT: ++ if (unlikely(xdp_do_redirect(dev, xdp, prog))) { ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_redirect; ++ goto update_stats; ++ case XDP_TX: { ++ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); ++ ++ if (!xdpf || mtk_xdp_submit_frame(eth, xdpf, dev, false)) { ++ count = &hw_stats->xdp_stats.rx_xdp_tx_errors; ++ act = XDP_DROP; ++ break; ++ } ++ ++ count = &hw_stats->xdp_stats.rx_xdp_tx; ++ goto update_stats; ++ } ++ default: ++ bpf_warn_invalid_xdp_action(dev, prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(dev, prog, act); ++ fallthrough; ++ case XDP_DROP: ++ break; ++ } ++ ++ page_pool_put_full_page(ring->page_pool, ++ virt_to_head_page(xdp->data), true); ++ ++update_stats: ++ u64_stats_update_begin(&hw_stats->syncp); ++ *count = *count + 1; ++ u64_stats_update_end(&hw_stats->syncp); ++out: ++ rcu_read_unlock(); ++ ++ return act; ++} ++ ++static int mtk_poll_rx(struct napi_struct *napi, int budget, ++ struct mtk_eth *eth) ++{ ++ struct dim_sample dim_sample = {}; ++ struct mtk_rx_ring *ring; ++ bool xdp_flush = false; ++ int idx; ++ struct sk_buff *skb; ++ u8 *data, *new_data; ++ struct mtk_rx_dma_v2 *rxd, trxd; ++ int done = 0, bytes = 0; ++ ++ while (done < budget) { ++ unsigned int pktlen, *rxdcsum; ++ struct net_device *netdev; ++ dma_addr_t dma_addr; ++ u32 hash, reason; ++ int mac = 0; ++ ++ ring = mtk_get_rx_ring(eth); ++ if (unlikely(!ring)) ++ goto rx_done; ++ ++ idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size); ++ rxd = ring->dma + idx * eth->soc->txrx.rxd_size; ++ data = ring->data[idx]; ++ ++ if (!mtk_rx_get_desc(eth, &trxd, rxd)) ++ break; ++ ++ /* find out which mac the packet come from. values start at 1 */ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ mac = RX_DMA_GET_SPORT_V2(trxd.rxd5) - 1; ++ else if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) && ++ !(trxd.rxd4 & RX_DMA_SPECIAL_TAG)) ++ mac = RX_DMA_GET_SPORT(trxd.rxd4) - 1; ++ ++ if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT || ++ !eth->netdev[mac])) ++ goto release_desc; ++ ++ netdev = eth->netdev[mac]; ++ ++ if (unlikely(test_bit(MTK_RESETTING, ð->state))) ++ goto release_desc; ++ ++ pktlen = RX_DMA_GET_PLEN0(trxd.rxd2); ++ ++ /* alloc new buffer */ ++ if (ring->page_pool) { ++ struct page *page = virt_to_head_page(data); ++ struct xdp_buff xdp; ++ u32 ret; ++ ++ new_data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, ++ GFP_ATOMIC); ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_sync_single_for_cpu(eth->dma_dev, ++ page_pool_get_dma_addr(page) + MTK_PP_HEADROOM, ++ pktlen, page_pool_get_dma_dir(ring->page_pool)); ++ ++ xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_q); ++ xdp_prepare_buff(&xdp, data, MTK_PP_HEADROOM, pktlen, ++ false); ++ xdp_buff_clear_frags_flag(&xdp); ++ ++ ret = mtk_xdp_run(eth, ring, &xdp, netdev); ++ if (ret == XDP_REDIRECT) ++ xdp_flush = true; ++ ++ if (ret != XDP_PASS) ++ goto skip_rx; ++ ++ skb = build_skb(data, PAGE_SIZE); ++ if (unlikely(!skb)) { ++ page_pool_put_full_page(ring->page_pool, ++ page, true); ++ netdev->stats.rx_dropped++; ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, xdp.data - xdp.data_hard_start); ++ skb_put(skb, xdp.data_end - xdp.data); ++ skb_mark_for_recycle(skb); ++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ new_data = napi_alloc_frag(ring->frag_size); ++ else ++ new_data = mtk_max_lro_buf_alloc(GFP_ATOMIC); ++ ++ if (unlikely(!new_data)) { ++ netdev->stats.rx_dropped++; ++ goto 
release_desc; ++ } ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ new_data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(new_data); ++ netdev->stats.rx_dropped++; ++ goto release_desc; ++ } ++ ++ dma_unmap_single(eth->dma_dev, trxd.rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ ++ skb = build_skb(data, ring->frag_size); ++ if (unlikely(!skb)) { ++ netdev->stats.rx_dropped++; ++ skb_free_frag(data); ++ goto skip_rx; ++ } ++ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb_put(skb, pktlen); ++ } ++ ++ skb->dev = netdev; ++ bytes += skb->len; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ hash = trxd.rxd5 & MTK_RXD5_FOE_ENTRY; ++ if (hash != MTK_RXD5_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd3; ++ } else { ++ hash = trxd.rxd4 & MTK_RXD4_FOE_ENTRY; ++ if (hash != MTK_RXD4_FOE_ENTRY) ++ skb_set_hash(skb, jhash_1word(hash, 0), ++ PKT_HASH_TYPE_L4); ++ rxdcsum = &trxd.rxd4; ++ } ++ ++ if (*rxdcsum & eth->soc->txrx.rx_dma_l4_valid) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++ skb->protocol = eth_type_trans(skb, netdev); ++ ++ reason = FIELD_GET(MTK_RXD4_PPE_CPU_REASON, trxd.rxd4); ++ if (reason == MTK_PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) ++ mtk_ppe_check_skb(eth->ppe, skb, hash); ++ ++ if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ if (trxd.rxd3 & RX_DMA_VTAG_V2) ++ __vlan_hwaccel_put_tag(skb, ++ htons(RX_DMA_VPID(trxd.rxd4)), ++ RX_DMA_VID(trxd.rxd4)); ++ } else if (trxd.rxd2 & RX_DMA_VTAG) { ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ++ RX_DMA_VID(trxd.rxd3)); ++ } ++ ++ /* If the device is attached to a dsa switch, the special ++ * tag inserted in VLAN field by hw switch can * be offloaded ++ * by RX HW VLAN offload. Clear vlan info. 
++ */ ++ if (netdev_uses_dsa(netdev)) ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ skb_record_rx_queue(skb, 0); ++ napi_gro_receive(napi, skb); ++ ++skip_rx: ++ ring->data[idx] = new_data; ++ rxd->rxd1 = (unsigned int)dma_addr; ++release_desc: ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ ring->calc_idx = idx; ++ done++; ++ } ++ ++rx_done: ++ if (done) { ++ /* make sure that all changes to the dma ring are flushed before ++ * we continue ++ */ ++ wmb(); ++ mtk_update_rx_cpu_idx(eth); ++ } ++ ++ eth->rx_packets += done; ++ eth->rx_bytes += bytes; ++ dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes, ++ &dim_sample); ++ net_dim(ð->rx_dim, dim_sample); ++ ++ if (xdp_flush) ++ xdp_do_flush_map(); ++ ++ return done; ++} ++ ++static int mtk_poll_tx_qdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->last_free_ptr; ++ dma = mtk_r32(eth, reg_map->qdma.drx_ptr); ++ ++ desc = mtk_qdma_phys_to_virt(ring, cpu); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ u32 next_cpu = desc->txd2; ++ int mac = 0; ++ ++ desc = mtk_qdma_phys_to_virt(ring, desc->txd2); ++ if ((desc->txd3 & TX_DMA_OWNER_CPU) == 0) ++ break; ++ ++ tx_buf = mtk_desc_to_tx_buf(ring, desc, ++ eth->soc->txrx.txd_size); ++ if (tx_buf->flags & MTK_TX_FLAGS_FPORT1) ++ mac = 1; ++ ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[mac] += skb->len; ++ done[mac]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = next_cpu; ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->last_free_ptr = cpu; ++ mtk_w32(eth, cpu, reg_map->qdma.crx_ptr); ++ ++ return budget; ++} ++ ++static int mtk_poll_tx_pdma(struct mtk_eth *eth, int budget, ++ unsigned int *done, unsigned int *bytes) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct mtk_tx_buf *tx_buf; ++ struct xdp_frame_bulk bq; ++ struct mtk_tx_dma *desc; ++ u32 cpu, dma; ++ ++ cpu = ring->cpu_idx; ++ dma = mtk_r32(eth, MT7628_TX_DTX_IDX0); ++ xdp_frame_bulk_init(&bq); ++ ++ while ((cpu != dma) && budget) { ++ tx_buf = &ring->buf[cpu]; ++ if (!tx_buf->data) ++ break; ++ ++ if (tx_buf->data != (void *)MTK_DMA_DUMMY_DESC) { ++ if (tx_buf->type == MTK_TYPE_SKB) { ++ struct sk_buff *skb = tx_buf->data; ++ ++ bytes[0] += skb->len; ++ done[0]++; ++ } ++ budget--; ++ } ++ mtk_tx_unmap(eth, tx_buf, &bq, true); ++ ++ desc = ring->dma + cpu * eth->soc->txrx.txd_size; ++ ring->last_free = desc; ++ atomic_inc(&ring->free_count); ++ ++ cpu = NEXT_DESP_IDX(cpu, ring->dma_size); ++ } ++ xdp_flush_frame_bulk(&bq); ++ ++ ring->cpu_idx = cpu; ++ ++ return budget; ++} ++ ++static int mtk_poll_tx(struct mtk_eth *eth, int budget) ++{ ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ struct dim_sample dim_sample = {}; ++ unsigned int done[MTK_MAX_DEVS]; ++ unsigned int bytes[MTK_MAX_DEVS]; ++ int total = 0, i; ++ ++ memset(done, 0, sizeof(done)); ++ memset(bytes, 0, sizeof(bytes)); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ budget = mtk_poll_tx_qdma(eth, budget, done, bytes); ++ else ++ budget = mtk_poll_tx_pdma(eth, 
budget, done, bytes); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i] || !done[i]) ++ continue; ++ netdev_completed_queue(eth->netdev[i], done[i], bytes[i]); ++ total += done[i]; ++ eth->tx_packets += done[i]; ++ eth->tx_bytes += bytes[i]; ++ } ++ ++ dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes, ++ &dim_sample); ++ net_dim(ð->tx_dim, dim_sample); ++ ++ if (mtk_queue_stopped(eth) && ++ (atomic_read(&ring->free_count) > ring->thresh)) ++ mtk_wake_queue(eth); ++ ++ return total; ++} ++ ++static void mtk_handle_status_irq(struct mtk_eth *eth) ++{ ++ u32 status2 = mtk_r32(eth, MTK_INT_STATUS2); ++ ++ if (unlikely(status2 & (MTK_GDM1_AF | MTK_GDM2_AF))) { ++ mtk_stats_update(eth); ++ mtk_w32(eth, (MTK_GDM1_AF | MTK_GDM2_AF), ++ MTK_INT_STATUS2); ++ } ++} ++ ++static int mtk_napi_tx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, tx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int tx_done = 0; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_handle_status_irq(eth); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->tx_irq_status); ++ tx_done = mtk_poll_tx(eth, budget); ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done tx %d, intr 0x%08x/0x%x\n", tx_done, ++ mtk_r32(eth, reg_map->tx_irq_status), ++ mtk_r32(eth, reg_map->tx_irq_mask)); ++ } ++ ++ if (tx_done == budget) ++ return budget; ++ ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ return budget; ++ ++ if (napi_complete_done(napi, tx_done)) ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ ++ return tx_done; ++} ++ ++static int mtk_napi_rx(struct napi_struct *napi, int budget) ++{ ++ struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int rx_done_total = 0; ++ ++ mtk_handle_status_irq(eth); ++ ++ do { ++ int rx_done; ++ ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, ++ reg_map->pdma.irq_status); ++ rx_done = mtk_poll_rx(napi, budget - rx_done_total, eth); ++ rx_done_total += rx_done; ++ ++ if (unlikely(netif_msg_intr(eth))) { ++ dev_info(eth->dev, ++ "done rx %d, intr 0x%08x/0x%x\n", rx_done, ++ mtk_r32(eth, reg_map->pdma.irq_status), ++ mtk_r32(eth, reg_map->pdma.irq_mask)); ++ } ++ ++ if (rx_done_total == budget) ++ return budget; ++ ++ } while (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask); ++ ++ if (napi_complete_done(napi, rx_done_total)) ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ ++ return rx_done_total; ++} ++ ++static int mtk_tx_alloc(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i, sz = soc->txrx.txd_size; ++ struct mtk_tx_dma_v2 *txd; ++ ++ ring->buf = kcalloc(MTK_DMA_SIZE, sizeof(*ring->buf), ++ GFP_KERNEL); ++ if (!ring->buf) ++ goto no_tx_mem; ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ int next = (i + 1) % MTK_DMA_SIZE; ++ u32 next_ptr = ring->phys + next * sz; ++ ++ txd = ring->dma + i * sz; ++ txd->txd2 = next_ptr; ++ txd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU; ++ txd->txd4 = 0; ++ if (MTK_HAS_CAPS(soc->caps, MTK_NETSYS_V2)) { ++ txd->txd5 = 0; ++ txd->txd6 = 0; ++ txd->txd7 = 0; ++ txd->txd8 = 0; ++ } ++ } ++ ++ /* On MT7688 (PDMA only) this driver uses the ring->dma structs ++ * only as the framework. 
The real HW descriptors are the PDMA ++ * descriptors in ring->dma_pdma. ++ */ ++ if (!MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz, ++ &ring->phys_pdma, GFP_KERNEL); ++ if (!ring->dma_pdma) ++ goto no_tx_mem; ++ ++ for (i = 0; i < MTK_DMA_SIZE; i++) { ++ ring->dma_pdma[i].txd2 = TX_DMA_DESP2_DEF; ++ ring->dma_pdma[i].txd4 = 0; ++ } ++ } ++ ++ ring->dma_size = MTK_DMA_SIZE; ++ atomic_set(&ring->free_count, MTK_DMA_SIZE - 2); ++ ring->next_free = ring->dma; ++ ring->last_free = (void *)txd; ++ ring->last_free_ptr = (u32)(ring->phys + ((MTK_DMA_SIZE - 1) * sz)); ++ ring->thresh = MAX_SKB_FRAGS; ++ ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (MTK_HAS_CAPS(soc->caps, MTK_QDMA)) { ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.ctx_ptr); ++ mtk_w32(eth, ring->phys, soc->reg_map->qdma.dtx_ptr); ++ mtk_w32(eth, ++ ring->phys + ((MTK_DMA_SIZE - 1) * sz), ++ soc->reg_map->qdma.crx_ptr); ++ mtk_w32(eth, ring->last_free_ptr, soc->reg_map->qdma.drx_ptr); ++ mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES, ++ soc->reg_map->qdma.qtx_cfg); ++ } else { ++ mtk_w32(eth, ring->phys_pdma, MT7628_TX_BASE_PTR0); ++ mtk_w32(eth, MTK_DMA_SIZE, MT7628_TX_MAX_CNT0); ++ mtk_w32(eth, 0, MT7628_TX_CTX_IDX0); ++ mtk_w32(eth, MT7628_PST_DTX_IDX0, soc->reg_map->pdma.rst_idx); ++ } ++ ++ return 0; ++ ++no_tx_mem: ++ return -ENOMEM; ++} ++ ++static void mtk_tx_clean(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ struct mtk_tx_ring *ring = ð->tx_ring; ++ int i; ++ ++ if (ring->buf) { ++ for (i = 0; i < MTK_DMA_SIZE; i++) ++ mtk_tx_unmap(eth, &ring->buf[i], NULL, false); ++ kfree(ring->buf); ++ ring->buf = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->dma_pdma) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ ring->dma_pdma, ring->phys_pdma); ++ ring->dma_pdma = NULL; ++ } ++} ++ ++static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) ++{ ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct mtk_rx_ring *ring; ++ int rx_data_len, rx_dma_size; ++ int i; ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ if (ring_no) ++ return -EINVAL; ++ ring = ð->rx_ring_qdma; ++ } else { ++ ring = ð->rx_ring[ring_no]; ++ } ++ ++ if (rx_flag == MTK_RX_FLAGS_HWLRO) { ++ rx_data_len = MTK_MAX_LRO_RX_LENGTH; ++ rx_dma_size = MTK_HW_LRO_DMA_SIZE; ++ } else { ++ rx_data_len = ETH_DATA_LEN; ++ rx_dma_size = MTK_DMA_SIZE; ++ } ++ ++ ring->frag_size = mtk_max_frag_size(rx_data_len); ++ ring->buf_size = mtk_max_buf_size(ring->frag_size); ++ ring->data = kcalloc(rx_dma_size, sizeof(*ring->data), ++ GFP_KERNEL); ++ if (!ring->data) ++ return -ENOMEM; ++ ++ if (mtk_page_pool_enabled(eth)) { ++ struct page_pool *pp; ++ ++ pp = mtk_create_page_pool(eth, &ring->xdp_q, ring_no, ++ rx_dma_size); ++ if (IS_ERR(pp)) ++ return PTR_ERR(pp); ++ ++ ring->page_pool = pp; ++ } ++ ++ ring->dma = dma_alloc_coherent(eth->dma_dev, ++ rx_dma_size * eth->soc->txrx.rxd_size, ++ &ring->phys, GFP_KERNEL); ++ if (!ring->dma) ++ return -ENOMEM; ++ ++ for (i = 0; i < rx_dma_size; i++) { ++ struct mtk_rx_dma_v2 *rxd; ++ dma_addr_t dma_addr; ++ void *data; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (ring->page_pool) { ++ data = mtk_page_pool_get_buff(ring->page_pool, ++ &dma_addr, GFP_KERNEL); ++ if (!data) ++ return -ENOMEM; 
++ } else { ++ if (ring->frag_size <= PAGE_SIZE) ++ data = netdev_alloc_frag(ring->frag_size); ++ else ++ data = mtk_max_lro_buf_alloc(GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ dma_addr = dma_map_single(eth->dma_dev, ++ data + NET_SKB_PAD + eth->ip_align, ++ ring->buf_size, DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(eth->dma_dev, ++ dma_addr))) { ++ skb_free_frag(data); ++ return -ENOMEM; ++ } ++ } ++ rxd->rxd1 = (unsigned int)dma_addr; ++ ring->data[i] = data; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ rxd->rxd2 = RX_DMA_LSO; ++ else ++ rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); ++ ++ rxd->rxd3 = 0; ++ rxd->rxd4 = 0; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ rxd->rxd5 = 0; ++ rxd->rxd6 = 0; ++ rxd->rxd7 = 0; ++ rxd->rxd8 = 0; ++ } ++ } ++ ++ ring->dma_size = rx_dma_size; ++ ring->calc_idx_update = false; ++ ring->calc_idx = rx_dma_size - 1; ++ if (rx_flag == MTK_RX_FLAGS_QDMA) ++ ring->crx_idx_reg = reg_map->qdma.qcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ else ++ ring->crx_idx_reg = reg_map->pdma.pcrx_ptr + ++ ring_no * MTK_QRX_OFFSET; ++ /* make sure that all changes to the dma ring are flushed before we ++ * continue ++ */ ++ wmb(); ++ ++ if (rx_flag == MTK_RX_FLAGS_QDMA) { ++ mtk_w32(eth, ring->phys, ++ reg_map->qdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->qdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->qdma.rst_idx); ++ } else { ++ mtk_w32(eth, ring->phys, ++ reg_map->pdma.rx_ptr + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, rx_dma_size, ++ reg_map->pdma.rx_cnt_cfg + ring_no * MTK_QRX_OFFSET); ++ mtk_w32(eth, MTK_PST_DRX_IDX_CFG(ring_no), ++ reg_map->pdma.rst_idx); ++ } ++ mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg); ++ ++ return 0; ++} ++ ++static void mtk_rx_clean(struct mtk_eth *eth, struct mtk_rx_ring *ring) ++{ ++ int i; ++ ++ if (ring->data && ring->dma) { ++ for (i = 0; i < ring->dma_size; i++) { ++ struct mtk_rx_dma *rxd; ++ ++ if (!ring->data[i]) ++ continue; ++ ++ rxd = ring->dma + i * eth->soc->txrx.rxd_size; ++ if (!rxd->rxd1) ++ continue; ++ ++ dma_unmap_single(eth->dma_dev, rxd->rxd1, ++ ring->buf_size, DMA_FROM_DEVICE); ++ mtk_rx_put_buff(ring, ring->data[i], false); ++ } ++ kfree(ring->data); ++ ring->data = NULL; ++ } ++ ++ if (ring->dma) { ++ dma_free_coherent(eth->dma_dev, ++ ring->dma_size * eth->soc->txrx.rxd_size, ++ ring->dma, ring->phys); ++ ring->dma = NULL; ++ } ++ ++ if (ring->page_pool) { ++ if (xdp_rxq_info_is_reg(&ring->xdp_q)) ++ xdp_rxq_info_unreg(&ring->xdp_q); ++ page_pool_destroy(ring->page_pool); ++ ring->page_pool = NULL; ++ } ++} ++ ++static int mtk_hwlro_rx_init(struct mtk_eth *eth) ++{ ++ int i; ++ u32 ring_ctrl_dw1 = 0, ring_ctrl_dw2 = 0, ring_ctrl_dw3 = 0; ++ u32 lro_ctrl_dw0 = 0, lro_ctrl_dw3 = 0; ++ ++ /* set LRO rings to auto-learn modes */ ++ ring_ctrl_dw2 |= MTK_RING_AUTO_LERAN_MODE; ++ ++ /* validate LRO ring */ ++ ring_ctrl_dw2 |= MTK_RING_VLD; ++ ++ /* set AGE timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_AGE_TIME_H; ++ ring_ctrl_dw1 |= MTK_RING_AGE_TIME_L; ++ ++ /* set max AGG timer (unit: 20us) */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_TIME; ++ ++ /* set max LRO AGG count */ ++ ring_ctrl_dw2 |= MTK_RING_MAX_AGG_CNT_L; ++ ring_ctrl_dw3 |= MTK_RING_MAX_AGG_CNT_H; ++ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ mtk_w32(eth, ring_ctrl_dw1, MTK_LRO_CTRL_DW1_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw2, MTK_LRO_CTRL_DW2_CFG(i)); ++ mtk_w32(eth, ring_ctrl_dw3, MTK_LRO_CTRL_DW3_CFG(i)); ++ 
} ++ ++ /* IPv4 checksum update enable */ ++ lro_ctrl_dw0 |= MTK_L3_CKS_UPD_EN; ++ ++ /* switch priority comparison to packet count mode */ ++ lro_ctrl_dw0 |= MTK_LRO_ALT_PKT_CNT_MODE; ++ ++ /* bandwidth threshold setting */ ++ mtk_w32(eth, MTK_HW_LRO_BW_THRE, MTK_PDMA_LRO_CTRL_DW2); ++ ++ /* auto-learn score delta setting */ ++ mtk_w32(eth, MTK_HW_LRO_REPLACE_DELTA, MTK_PDMA_LRO_ALT_SCORE_DELTA); ++ ++ /* set refresh timer for altering flows to 1 sec. (unit: 20us) */ ++ mtk_w32(eth, (MTK_HW_LRO_TIMER_UNIT << 16) | MTK_HW_LRO_REFRESH_TIME, ++ MTK_PDMA_LRO_ALT_REFRESH_TIMER); ++ ++ /* set HW LRO mode & the max aggregation count for rx packets */ ++ lro_ctrl_dw3 |= MTK_ADMA_MODE | (MTK_HW_LRO_MAX_AGG_CNT & 0xff); ++ ++ /* the minimal remaining room of SDL0 in RXD for lro aggregation */ ++ lro_ctrl_dw3 |= MTK_LRO_MIN_RXD_SDL; ++ ++ /* enable HW LRO */ ++ lro_ctrl_dw0 |= MTK_LRO_EN; ++ ++ mtk_w32(eth, lro_ctrl_dw3, MTK_PDMA_LRO_CTRL_DW3); ++ mtk_w32(eth, lro_ctrl_dw0, MTK_PDMA_LRO_CTRL_DW0); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_rx_uninit(struct mtk_eth *eth) ++{ ++ int i; ++ u32 val; ++ ++ /* relinquish lro rings, flush aggregated packets */ ++ mtk_w32(eth, MTK_LRO_RING_RELINQUISH_REQ, MTK_PDMA_LRO_CTRL_DW0); ++ ++ /* wait for relinquishments done */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, MTK_PDMA_LRO_CTRL_DW0); ++ if (val & MTK_LRO_RING_RELINQUISH_DONE) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++ ++ /* invalidate lro rings */ ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_w32(eth, 0, MTK_LRO_CTRL_DW2_CFG(i)); ++ ++ /* disable HW LRO */ ++ mtk_w32(eth, 0, MTK_PDMA_LRO_CTRL_DW0); ++} ++ ++static void mtk_hwlro_val_ipaddr(struct mtk_eth *eth, int idx, __be32 ip) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, ip, MTK_LRO_DIP_DW0_CFG(idx)); ++ ++ /* validate the IP setting */ ++ mtk_w32(eth, (reg_val | MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++} ++ ++static void mtk_hwlro_inval_ipaddr(struct mtk_eth *eth, int idx) ++{ ++ u32 reg_val; ++ ++ reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ /* invalidate the IP setting */ ++ mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx)); ++ ++ mtk_w32(eth, 0, MTK_LRO_DIP_DW0_CFG(idx)); ++} ++ ++static int mtk_hwlro_get_ip_cnt(struct mtk_mac *mac) ++{ ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) ++ cnt++; ++ } ++ ++ return cnt; ++} ++ ++static int mtk_hwlro_add_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if ((fsp->flow_type != TCP_V4_FLOW) || ++ (!fsp->h_u.tcp_ip4_spec.ip4dst) || ++ (fsp->location > 1)) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = htonl(fsp->h_u.tcp_ip4_spec.ip4dst); ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_val_ipaddr(eth, hwlro_idx, mac->hwlro_ip[fsp->location]); ++ ++ return 0; ++} ++ ++static int mtk_hwlro_del_ipaddr(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int hwlro_idx; ++ ++ if 
(fsp->location > 1) ++ return -EINVAL; ++ ++ mac->hwlro_ip[fsp->location] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location; ++ ++ mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ ++ return 0; ++} ++ ++static void mtk_hwlro_netdev_disable(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int i, hwlro_idx; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ mac->hwlro_ip[i] = 0; ++ hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + i; ++ ++ mtk_hwlro_inval_ipaddr(eth, hwlro_idx); ++ } ++ ++ mac->hwlro_ip_cnt = 0; ++} ++ ++static int mtk_hwlro_get_fdir_entry(struct net_device *dev, ++ struct ethtool_rxnfc *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct ethtool_rx_flow_spec *fsp = ++ (struct ethtool_rx_flow_spec *)&cmd->fs; ++ ++ if (fsp->location >= ARRAY_SIZE(mac->hwlro_ip)) ++ return -EINVAL; ++ ++ /* only tcp dst ipv4 is meaningful, others are meaningless */ ++ fsp->flow_type = TCP_V4_FLOW; ++ fsp->h_u.tcp_ip4_spec.ip4dst = ntohl(mac->hwlro_ip[fsp->location]); ++ fsp->m_u.tcp_ip4_spec.ip4dst = 0; ++ ++ fsp->h_u.tcp_ip4_spec.ip4src = 0; ++ fsp->m_u.tcp_ip4_spec.ip4src = 0xffffffff; ++ fsp->h_u.tcp_ip4_spec.psrc = 0; ++ fsp->m_u.tcp_ip4_spec.psrc = 0xffff; ++ fsp->h_u.tcp_ip4_spec.pdst = 0; ++ fsp->m_u.tcp_ip4_spec.pdst = 0xffff; ++ fsp->h_u.tcp_ip4_spec.tos = 0; ++ fsp->m_u.tcp_ip4_spec.tos = 0xff; ++ ++ return 0; ++} ++ ++static int mtk_hwlro_get_fdir_all(struct net_device *dev, ++ struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ int cnt = 0; ++ int i; ++ ++ for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { ++ if (mac->hwlro_ip[i]) { ++ rule_locs[cnt] = i; ++ cnt++; ++ } ++ } ++ ++ cmd->rule_cnt = cnt; ++ ++ return 0; ++} ++ ++static netdev_features_t mtk_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (!(features & NETIF_F_LRO)) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ int ip_cnt = mtk_hwlro_get_ip_cnt(mac); ++ ++ if (ip_cnt) { ++ netdev_info(dev, "RX flow is programmed, LRO should keep on\n"); ++ ++ features |= NETIF_F_LRO; ++ } ++ } ++ ++ return features; ++} ++ ++static int mtk_set_features(struct net_device *dev, netdev_features_t features) ++{ ++ int err = 0; ++ ++ if (!((dev->features ^ features) & NETIF_F_LRO)) ++ return 0; ++ ++ if (!(features & NETIF_F_LRO)) ++ mtk_hwlro_netdev_disable(dev); ++ ++ return err; ++} ++ ++/* wait for DMA to finish whatever it is doing before we start using it again */ ++static int mtk_dma_busy_wait(struct mtk_eth *eth) ++{ ++ unsigned int reg; ++ int ret; ++ u32 val; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ reg = eth->soc->reg_map->qdma.glo_cfg; ++ else ++ reg = eth->soc->reg_map->pdma.glo_cfg; ++ ++ ret = readx_poll_timeout_atomic(__raw_readl, eth->base + reg, val, ++ !(val & (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)), ++ 5, MTK_DMA_BUSY_TIMEOUT_US); ++ if (ret) ++ dev_err(eth->dev, "DMA init timeout\n"); ++ ++ return ret; ++} ++ ++static int mtk_dma_init(struct mtk_eth *eth) ++{ ++ int err; ++ u32 i; ++ ++ if (mtk_dma_busy_wait(eth)) ++ return -EBUSY; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* QDMA needs scratch memory for internal reordering of the ++ * descriptors ++ */ ++ err = mtk_init_fq_dma(eth); ++ if (err) ++ return err; ++ } ++ ++ err = mtk_tx_alloc(eth); ++ if (err) ++ return err; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_QDMA); ++ if (err) ++ return err; ++ } 
++ ++ err = mtk_rx_alloc(eth, 0, MTK_RX_FLAGS_NORMAL); ++ if (err) ++ return err; ++ ++ if (eth->hwlro) { ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) { ++ err = mtk_rx_alloc(eth, i, MTK_RX_FLAGS_HWLRO); ++ if (err) ++ return err; ++ } ++ err = mtk_hwlro_rx_init(eth); ++ if (err) ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ /* Enable random early drop and set drop threshold ++ * automatically ++ */ ++ mtk_w32(eth, FC_THRES_DROP_MODE | FC_THRES_DROP_EN | ++ FC_THRES_MIN, eth->soc->reg_map->qdma.fc_th); ++ mtk_w32(eth, 0x0, eth->soc->reg_map->qdma.hred); ++ } ++ ++ return 0; ++} ++ ++static void mtk_dma_free(struct mtk_eth *eth) ++{ ++ const struct mtk_soc_data *soc = eth->soc; ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ if (eth->netdev[i]) ++ netdev_reset_queue(eth->netdev[i]); ++ if (eth->scratch_ring) { ++ dma_free_coherent(eth->dma_dev, ++ MTK_DMA_SIZE * soc->txrx.txd_size, ++ eth->scratch_ring, eth->phy_scratch_ring); ++ eth->scratch_ring = NULL; ++ eth->phy_scratch_ring = 0; ++ } ++ mtk_tx_clean(eth); ++ mtk_rx_clean(eth, ð->rx_ring[0]); ++ mtk_rx_clean(eth, ð->rx_ring_qdma); ++ ++ if (eth->hwlro) { ++ mtk_hwlro_rx_uninit(eth); ++ for (i = 1; i < MTK_MAX_RX_RING_NUM; i++) ++ mtk_rx_clean(eth, ð->rx_ring[i]); ++ } ++ ++ kfree(eth->scratch_head); ++} ++ ++static void mtk_tx_timeout(struct net_device *dev, unsigned int txqueue) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ eth->netdev[mac->id]->stats.tx_errors++; ++ netif_err(eth, tx_err, dev, ++ "transmit timed out\n"); ++ schedule_work(ð->pending_work); ++} ++ ++static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->rx_events++; ++ if (likely(napi_schedule_prep(ð->rx_napi))) { ++ __napi_schedule(ð->rx_napi); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ ++ eth->tx_events++; ++ if (likely(napi_schedule_prep(ð->tx_napi))) { ++ __napi_schedule(ð->tx_napi); ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++static irqreturn_t mtk_handle_irq(int irq, void *_eth) ++{ ++ struct mtk_eth *eth = _eth; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ ++ if (mtk_r32(eth, reg_map->pdma.irq_mask) & ++ eth->soc->txrx.rx_irq_done_mask) { ++ if (mtk_r32(eth, reg_map->pdma.irq_status) & ++ eth->soc->txrx.rx_irq_done_mask) ++ mtk_handle_irq_rx(irq, _eth); ++ } ++ if (mtk_r32(eth, reg_map->tx_irq_mask) & MTK_TX_DONE_INT) { ++ if (mtk_r32(eth, reg_map->tx_irq_status) & MTK_TX_DONE_INT) ++ mtk_handle_irq_tx(irq, _eth); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++static void mtk_poll_controller(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ mtk_handle_irq_rx(eth->irq[2], dev); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++} ++#endif ++ ++static int mtk_start_dma(struct mtk_eth *eth) ++{ ++ u32 val, rx_2b_offset = (NET_IP_ALIGN == 2) ? 
MTK_RX_2B_OFFSET : 0; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int err; ++ ++ err = mtk_dma_init(eth); ++ if (err) { ++ mtk_dma_free(eth); ++ return err; ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) { ++ val = mtk_r32(eth, reg_map->qdma.glo_cfg); ++ val |= MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_TX_BT_32DWORDS | MTK_NDP_CO_PRO | ++ MTK_RX_2B_OFFSET | MTK_TX_WB_DDONE; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) ++ val |= MTK_MUTLI_CNT | MTK_RESV_BUF | ++ MTK_WCOMP_EN | MTK_DMAD_WR_WDONE | ++ MTK_CHK_DDONE_EN; ++ else ++ val |= MTK_RX_BT_32DWORDS; ++ mtk_w32(eth, val, reg_map->qdma.glo_cfg); ++ ++ mtk_w32(eth, ++ MTK_RX_DMA_EN | rx_2b_offset | ++ MTK_RX_BT_32DWORDS | MTK_MULTI_EN, ++ reg_map->pdma.glo_cfg); ++ } else { ++ mtk_w32(eth, MTK_TX_WB_DDONE | MTK_TX_DMA_EN | MTK_RX_DMA_EN | ++ MTK_MULTI_EN | MTK_PDMA_SIZE_8DWORDS, ++ reg_map->pdma.glo_cfg); ++ } ++ ++ return 0; ++} ++ ++static void mtk_gdm_config(struct mtk_eth *eth, u32 config) ++{ ++ int i; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ return; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ u32 val = mtk_r32(eth, MTK_GDMA_FWD_CFG(i)); ++ ++ /* default setup the forward port to send frame to PDMA */ ++ val &= ~0xffff; ++ ++ /* Enable RX checksum */ ++ val |= MTK_GDMA_ICS_EN | MTK_GDMA_TCS_EN | MTK_GDMA_UCS_EN; ++ ++ val |= config; ++ ++ if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0])) ++ val |= MTK_GDMA_SPECIAL_TAG; ++ ++ mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i)); ++ } ++ /* Reset and enable PSE */ ++ mtk_w32(eth, RST_GL_PSE, MTK_RST_GL); ++ mtk_w32(eth, 0, MTK_RST_GL); ++} ++ ++static int mtk_open(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int err; ++ ++ err = phylink_of_phy_connect(mac->phylink, mac->of_node, 0); ++ if (err) { ++ netdev_err(dev, "%s: could not attach PHY: %d\n", __func__, ++ err); ++ return err; ++ } ++ ++ /* we run 2 netdevs on the same dma ring so we only bring it up once */ ++ if (!refcount_read(ð->dma_refcnt)) { ++ u32 gdm_config = MTK_GDMA_TO_PDMA; ++ ++ err = mtk_start_dma(eth); ++ if (err) { ++ phylink_disconnect_phy(mac->phylink); ++ return err; ++ } ++ ++ if (eth->soc->offload_version && mtk_ppe_start(eth->ppe) == 0) ++ gdm_config = MTK_GDMA_TO_PPE; ++ ++ mtk_gdm_config(eth, gdm_config); ++ ++ napi_enable(ð->tx_napi); ++ napi_enable(ð->rx_napi); ++ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_enable(eth, eth->soc->txrx.rx_irq_done_mask); ++ refcount_set(ð->dma_refcnt, 1); ++ } ++ else ++ refcount_inc(ð->dma_refcnt); ++ ++ phylink_start(mac->phylink); ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static void mtk_stop_dma(struct mtk_eth *eth, u32 glo_cfg) ++{ ++ u32 val; ++ int i; ++ ++ /* stop the dma engine */ ++ spin_lock_bh(ð->page_lock); ++ val = mtk_r32(eth, glo_cfg); ++ mtk_w32(eth, val & ~(MTK_TX_WB_DDONE | MTK_RX_DMA_EN | MTK_TX_DMA_EN), ++ glo_cfg); ++ spin_unlock_bh(ð->page_lock); ++ ++ /* wait for dma stop */ ++ for (i = 0; i < 10; i++) { ++ val = mtk_r32(eth, glo_cfg); ++ if (val & (MTK_TX_DMA_BUSY | MTK_RX_DMA_BUSY)) { ++ msleep(20); ++ continue; ++ } ++ break; ++ } ++} ++ ++static int mtk_stop(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_stop(mac->phylink); ++ ++ netif_tx_disable(dev); ++ ++ phylink_disconnect_phy(mac->phylink); ++ ++ /* only shutdown DMA if this is the last user */ ++ if (!refcount_dec_and_test(ð->dma_refcnt)) ++ return 0; ++ ++ mtk_gdm_config(eth, 
MTK_GDMA_DROP_ALL); ++ ++ mtk_tx_irq_disable(eth, MTK_TX_DONE_INT); ++ mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask); ++ napi_disable(ð->tx_napi); ++ napi_disable(ð->rx_napi); ++ ++ cancel_work_sync(ð->rx_dim.work); ++ cancel_work_sync(ð->tx_dim.work); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_stop_dma(eth, eth->soc->reg_map->qdma.glo_cfg); ++ mtk_stop_dma(eth, eth->soc->reg_map->pdma.glo_cfg); ++ ++ mtk_dma_free(eth); ++ ++ if (eth->soc->offload_version) ++ mtk_ppe_stop(eth->ppe); ++ ++ return 0; ++} ++ ++static int mtk_xdp_setup(struct net_device *dev, struct bpf_prog *prog, ++ struct netlink_ext_ack *extack) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ struct bpf_prog *old_prog; ++ bool need_update; ++ ++ if (eth->hwlro) { ++ NL_SET_ERR_MSG_MOD(extack, "XDP not supported with HWLRO"); ++ return -EOPNOTSUPP; ++ } ++ ++ if (dev->mtu > MTK_PP_MAX_BUF_SIZE) { ++ NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP"); ++ return -EOPNOTSUPP; ++ } ++ ++ need_update = !!eth->prog != !!prog; ++ if (netif_running(dev) && need_update) ++ mtk_stop(dev); ++ ++ old_prog = rcu_replace_pointer(eth->prog, prog, lockdep_rtnl_is_held()); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ ++ if (netif_running(dev) && need_update) ++ return mtk_open(dev); ++ ++ return 0; ++} ++ ++static int mtk_xdp(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ return mtk_xdp_setup(dev, xdp->prog, xdp->extack); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ethsys_reset(struct mtk_eth *eth, u32 reset_bits) ++{ ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ reset_bits); ++ ++ usleep_range(1000, 1100); ++ regmap_update_bits(eth->ethsys, ETHSYS_RSTCTRL, ++ reset_bits, ++ ~reset_bits); ++ mdelay(10); ++} ++ ++static void mtk_clk_disable(struct mtk_eth *eth) ++{ ++ int clk; ++ ++ for (clk = MTK_CLK_MAX - 1; clk >= 0; clk--) ++ clk_disable_unprepare(eth->clks[clk]); ++} ++ ++static int mtk_clk_enable(struct mtk_eth *eth) ++{ ++ int clk, ret; ++ ++ for (clk = 0; clk < MTK_CLK_MAX ; clk++) { ++ ret = clk_prepare_enable(eth->clks[clk]); ++ if (ret) ++ goto err_disable_clks; ++ } ++ ++ return 0; ++ ++err_disable_clks: ++ while (--clk >= 0) ++ clk_disable_unprepare(eth->clks[clk]); ++ ++ return ret; ++} ++ ++static void mtk_dim_rx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, rx_dim); ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_rx_moderation(eth->rx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_TX_MASK; ++ val |= MTK_PDMA_DELAY_RX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_RX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static void mtk_dim_tx(struct work_struct *work) ++{ ++ struct dim *dim = container_of(work, struct dim, work); ++ struct mtk_eth *eth = container_of(dim, struct mtk_eth, tx_dim); ++ const struct 
mtk_reg_map *reg_map = eth->soc->reg_map; ++ struct dim_cq_moder cur_profile; ++ u32 val, cur; ++ ++ cur_profile = net_dim_get_tx_moderation(eth->tx_dim.mode, ++ dim->profile_ix); ++ spin_lock_bh(ð->dim_lock); ++ ++ val = mtk_r32(eth, reg_map->pdma.delay_irq); ++ val &= MTK_PDMA_DELAY_RX_MASK; ++ val |= MTK_PDMA_DELAY_TX_EN; ++ ++ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PTIME_SHIFT; ++ ++ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK); ++ val |= cur << MTK_PDMA_DELAY_TX_PINT_SHIFT; ++ ++ mtk_w32(eth, val, reg_map->pdma.delay_irq); ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) ++ mtk_w32(eth, val, reg_map->qdma.delay_irq); ++ ++ spin_unlock_bh(ð->dim_lock); ++ ++ dim->state = DIM_START_MEASURE; ++} ++ ++static int mtk_hw_init(struct mtk_eth *eth) ++{ ++ u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA | ++ ETHSYS_DMA_AG_MAP_PPE; ++ const struct mtk_reg_map *reg_map = eth->soc->reg_map; ++ int i, val, ret; ++ ++ if (test_and_set_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ pm_runtime_enable(eth->dev); ++ pm_runtime_get_sync(eth->dev); ++ ++ ret = mtk_clk_enable(eth); ++ if (ret) ++ goto err_disable_pm; ++ ++ if (eth->ethsys) ++ regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask, ++ of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ ret = device_reset(eth->dev); ++ if (ret) { ++ dev_err(eth->dev, "MAC reset failed!\n"); ++ goto err_disable_pm; ++ } ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ return 0; ++ } ++ ++ val = RSTCTRL_FE | RSTCTRL_PPE; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, 0); ++ ++ val |= RSTCTRL_ETH; ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_RSTCTRL_PPE1)) ++ val |= RSTCTRL_PPE1; ++ } ++ ++ ethsys_reset(eth, val); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ regmap_write(eth->ethsys, ETHSYS_FE_RST_CHK_IDLE_EN, ++ 0x3ffffff); ++ ++ /* Set FE to PDMAv2 if necessary */ ++ val = mtk_r32(eth, MTK_FE_GLO_MISC); ++ mtk_w32(eth, val | BIT(4), MTK_FE_GLO_MISC); ++ } ++ ++ if (eth->pctl) { ++ /* Set GE2 driving and slew rate */ ++ regmap_write(eth->pctl, GPIO_DRV_SEL10, 0xa00); ++ ++ /* set GE2 TDSEL */ ++ regmap_write(eth->pctl, GPIO_OD33_CTRL8, 0x5); ++ ++ /* set GE2 TUNE */ ++ regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0); ++ } ++ ++ /* Set linkdown as the default for each GMAC. Its own MCR would be set ++ * up with the more appropriate value when mtk_mac_config call is being ++ * invoked. ++ */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) ++ mtk_w32(eth, MAC_MCR_FORCE_LINK_DOWN, MTK_MAC_MCR(i)); ++ ++ /* Indicates CDM to parse the MTK special tag from CPU ++ * which also is working out for untag packets. 
++ */ ++ val = mtk_r32(eth, MTK_CDMQ_IG_CTRL); ++ mtk_w32(eth, val | MTK_CDMQ_STAG_EN, MTK_CDMQ_IG_CTRL); ++ ++ /* Enable RX VLan Offloading */ ++ mtk_w32(eth, 1, MTK_CDMP_EG_CTRL); ++ ++ /* set interrupt delays based on current Net DIM sample */ ++ mtk_dim_rx(ð->rx_dim.work); ++ mtk_dim_tx(ð->tx_dim.work); ++ ++ /* disable delay and normal interrupt */ ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++ ++ /* FE int grouping */ ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->pdma.int_grp); ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->pdma.int_grp + 4); ++ mtk_w32(eth, MTK_TX_DONE_INT, reg_map->qdma.int_grp); ++ mtk_w32(eth, eth->soc->txrx.rx_irq_done_mask, reg_map->qdma.int_grp + 4); ++ mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_NETSYS_V2)) { ++ /* PSE should not drop port8 and port9 packets */ ++ mtk_w32(eth, 0x00000300, PSE_DROP_CFG); ++ ++ /* PSE Free Queue Flow Control */ ++ mtk_w32(eth, 0x01fa01f4, PSE_FQFC_CFG2); ++ ++ /* PSE config input queue threshold */ ++ mtk_w32(eth, 0x001a000e, PSE_IQ_REV(1)); ++ mtk_w32(eth, 0x01ff001a, PSE_IQ_REV(2)); ++ mtk_w32(eth, 0x000e01ff, PSE_IQ_REV(3)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(4)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(5)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(6)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(7)); ++ mtk_w32(eth, 0x000e000e, PSE_IQ_REV(8)); ++ ++ /* PSE config output queue threshold */ ++ mtk_w32(eth, 0x000f000a, PSE_OQ_TH(1)); ++ mtk_w32(eth, 0x001a000f, PSE_OQ_TH(2)); ++ mtk_w32(eth, 0x000f001a, PSE_OQ_TH(3)); ++ mtk_w32(eth, 0x01ff000f, PSE_OQ_TH(4)); ++ mtk_w32(eth, 0x000f000f, PSE_OQ_TH(5)); ++ mtk_w32(eth, 0x0006000f, PSE_OQ_TH(6)); ++ mtk_w32(eth, 0x00060006, PSE_OQ_TH(7)); ++ mtk_w32(eth, 0x00060006, PSE_OQ_TH(8)); ++ ++ /* GDM and CDM Threshold */ ++ mtk_w32(eth, 0x00000004, MTK_GDM2_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMW0_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMW1_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDME0_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDME1_THRES); ++ mtk_w32(eth, 0x00000004, MTK_CDMM_THRES); ++ } ++ ++ return 0; ++ ++err_disable_pm: ++ pm_runtime_put_sync(eth->dev); ++ pm_runtime_disable(eth->dev); ++ ++ return ret; ++} ++ ++static int mtk_hw_deinit(struct mtk_eth *eth) ++{ ++ if (!test_and_clear_bit(MTK_HW_INIT, ð->state)) ++ return 0; ++ ++ mtk_clk_disable(eth); ++ ++ pm_runtime_put_sync(eth->dev); ++ pm_runtime_disable(eth->dev); ++ ++ return 0; ++} ++ ++static int __init mtk_init(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ int ret; ++ ++ ret = of_get_ethdev_address(mac->of_node, dev); ++ if (ret) { ++ /* If the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(dev); ++ dev_err(eth->dev, "generated random MAC address %pM\n", ++ dev->dev_addr); ++ } ++ ++ return 0; ++} ++ ++static void mtk_uninit(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ ++ phylink_disconnect_phy(mac->phylink); ++ mtk_tx_irq_disable(eth, ~0); ++ mtk_rx_irq_disable(eth, ~0); ++} ++ ++static int mtk_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ int length = new_mtu + MTK_RX_ETH_HLEN; ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_eth *eth = mac->hw; ++ u32 mcr_cur, mcr_new; ++ ++ if (rcu_access_pointer(eth->prog) && ++ length > MTK_PP_MAX_BUF_SIZE) { ++ netdev_err(dev, "Invalid MTU for XDP mode\n"); ++ return -EINVAL; ++ } ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ 
mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); ++ mcr_new = mcr_cur & ~MAC_MCR_MAX_RX_MASK; ++ ++ if (length <= 1518) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1518); ++ else if (length <= 1536) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1536); ++ else if (length <= 1552) ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1552); ++ else ++ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_2048); ++ ++ if (mcr_new != mcr_cur) ++ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id)); ++ } ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSMIIREG: ++ return phylink_mii_ioctl(mac->phylink, ifr, cmd); ++ default: ++ break; ++ } ++ ++ return -EOPNOTSUPP; ++} ++ ++static void mtk_pending_work(struct work_struct *work) ++{ ++ struct mtk_eth *eth = container_of(work, struct mtk_eth, pending_work); ++ int err, i; ++ unsigned long restart = 0; ++ ++ rtnl_lock(); ++ ++ dev_dbg(eth->dev, "[%s][%d] reset\n", __func__, __LINE__); ++ ++ while (test_and_set_bit_lock(MTK_RESETTING, ð->state)) ++ cpu_relax(); ++ ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop starts\n", __func__, __LINE__); ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ __set_bit(i, &restart); ++ } ++ dev_dbg(eth->dev, "[%s][%d] mtk_stop ends\n", __func__, __LINE__); ++ ++ /* restart underlying hardware such as power, clock, pin mux ++ * and the connected phy ++ */ ++ mtk_hw_deinit(eth); ++ ++ if (eth->dev->pins) ++ pinctrl_select_state(eth->dev->pins->p, ++ eth->dev->pins->default_state); ++ mtk_hw_init(eth); ++ ++ /* restart DMA and enable IRQs */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!test_bit(i, &restart)) ++ continue; ++ err = mtk_open(eth->netdev[i]); ++ if (err) { ++ netif_alert(eth, ifup, eth->netdev[i], ++ "Driver up/down cycle failed, closing device.\n"); ++ dev_close(eth->netdev[i]); ++ } ++ } ++ ++ dev_dbg(eth->dev, "[%s][%d] reset done\n", __func__, __LINE__); ++ ++ clear_bit_unlock(MTK_RESETTING, ð->state); ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_free_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ free_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_unreg_dev(struct mtk_eth *eth) ++{ ++ int i; ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ unregister_netdev(eth->netdev[i]); ++ } ++ ++ return 0; ++} ++ ++static int mtk_cleanup(struct mtk_eth *eth) ++{ ++ mtk_unreg_dev(eth); ++ mtk_free_dev(eth); ++ cancel_work_sync(ð->pending_work); ++ ++ return 0; ++} ++ ++static int mtk_get_link_ksettings(struct net_device *ndev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_get(mac->phylink, cmd); ++} ++ ++static int mtk_set_link_ksettings(struct net_device *ndev, ++ const struct ethtool_link_ksettings *cmd) ++{ ++ struct mtk_mac *mac = netdev_priv(ndev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ return phylink_ethtool_ksettings_set(mac->phylink, cmd); ++} ++ ++static void mtk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ 
++ strlcpy(info->driver, mac->hw->dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(mac->hw->dev), sizeof(info->bus_info)); ++ info->n_stats = ARRAY_SIZE(mtk_ethtool_stats); ++} ++ ++static u32 mtk_get_msglevel(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ return mac->hw->msg_enable; ++} ++ ++static void mtk_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ mac->hw->msg_enable = value; ++} ++ ++static int mtk_nway_reset(struct net_device *dev) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return -EBUSY; ++ ++ if (!mac->phylink) ++ return -ENOTSUPP; ++ ++ return phylink_ethtool_nway_reset(mac->phylink); ++} ++ ++static void mtk_get_strings(struct net_device *dev, u32 stringset, u8 *data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) { ++ memcpy(data, mtk_ethtool_stats[i].str, ETH_GSTRING_LEN); ++ data += ETH_GSTRING_LEN; ++ } ++ if (mtk_page_pool_enabled(mac->hw)) ++ page_pool_ethtool_stats_get_strings(data); ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++static int mtk_get_sset_count(struct net_device *dev, int sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: { ++ int count = ARRAY_SIZE(mtk_ethtool_stats); ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ if (mtk_page_pool_enabled(mac->hw)) ++ count += page_pool_ethtool_stats_get_count(); ++ return count; ++ } ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void mtk_ethtool_pp_stats(struct mtk_eth *eth, u64 *data) ++{ ++ struct page_pool_stats stats = {}; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(eth->rx_ring); i++) { ++ struct mtk_rx_ring *ring = ð->rx_ring[i]; ++ ++ if (!ring->page_pool) ++ continue; ++ ++ page_pool_get_stats(ring->page_pool, &stats); ++ } ++ page_pool_ethtool_stats_get(data, &stats); ++} ++ ++static void mtk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 *data) ++{ ++ struct mtk_mac *mac = netdev_priv(dev); ++ struct mtk_hw_stats *hwstats = mac->hw_stats; ++ u64 *data_src, *data_dst; ++ unsigned int start; ++ int i; ++ ++ if (unlikely(test_bit(MTK_RESETTING, &mac->hw->state))) ++ return; ++ ++ if (netif_running(dev) && netif_device_present(dev)) { ++ if (spin_trylock_bh(&hwstats->stats_lock)) { ++ mtk_stats_update_mac(mac); ++ spin_unlock_bh(&hwstats->stats_lock); ++ } ++ } ++ ++ data_src = (u64 *)hwstats; ++ ++ do { ++ data_dst = data; ++ start = u64_stats_fetch_begin_irq(&hwstats->syncp); ++ ++ for (i = 0; i < ARRAY_SIZE(mtk_ethtool_stats); i++) ++ *data_dst++ = *(data_src + mtk_ethtool_stats[i].offset); ++ if (mtk_page_pool_enabled(mac->hw)) ++ mtk_ethtool_pp_stats(mac->hw, data_dst); ++ } while (u64_stats_fetch_retry_irq(&hwstats->syncp, start)); ++} ++ ++static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ if (dev->hw_features & NETIF_F_LRO) { ++ cmd->data = MTK_MAX_RX_RING_NUM; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRLCNT: ++ if (dev->hw_features & NETIF_F_LRO) { ++ struct mtk_mac *mac = netdev_priv(dev); ++ ++ cmd->rule_cnt = mac->hwlro_ip_cnt; ++ ret = 0; ++ } ++ break; ++ case ETHTOOL_GRXCLSRULE: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_entry(dev, cmd); ++ break; ++ case ETHTOOL_GRXCLSRLALL: ++ if (dev->hw_features & 
NETIF_F_LRO) ++ ret = mtk_hwlro_get_fdir_all(dev, cmd, ++ rule_locs); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static int mtk_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ int ret = -EOPNOTSUPP; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXCLSRLINS: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_add_ipaddr(dev, cmd); ++ break; ++ case ETHTOOL_SRXCLSRLDEL: ++ if (dev->hw_features & NETIF_F_LRO) ++ ret = mtk_hwlro_del_ipaddr(dev, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static const struct ethtool_ops mtk_ethtool_ops = { ++ .get_link_ksettings = mtk_get_link_ksettings, ++ .set_link_ksettings = mtk_set_link_ksettings, ++ .get_drvinfo = mtk_get_drvinfo, ++ .get_msglevel = mtk_get_msglevel, ++ .set_msglevel = mtk_set_msglevel, ++ .nway_reset = mtk_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_strings = mtk_get_strings, ++ .get_sset_count = mtk_get_sset_count, ++ .get_ethtool_stats = mtk_get_ethtool_stats, ++ .get_rxnfc = mtk_get_rxnfc, ++ .set_rxnfc = mtk_set_rxnfc, ++}; ++ ++static const struct net_device_ops mtk_netdev_ops = { ++ .ndo_init = mtk_init, ++ .ndo_uninit = mtk_uninit, ++ .ndo_open = mtk_open, ++ .ndo_stop = mtk_stop, ++ .ndo_start_xmit = mtk_start_xmit, ++ .ndo_set_mac_address = mtk_set_mac_address, ++ .ndo_validate_addr = eth_validate_addr, ++ .ndo_eth_ioctl = mtk_do_ioctl, ++ .ndo_change_mtu = mtk_change_mtu, ++ .ndo_tx_timeout = mtk_tx_timeout, ++ .ndo_get_stats64 = mtk_get_stats64, ++ .ndo_fix_features = mtk_fix_features, ++ .ndo_set_features = mtk_set_features, ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = mtk_poll_controller, ++#endif ++ .ndo_setup_tc = mtk_eth_setup_tc, ++ .ndo_bpf = mtk_xdp, ++ .ndo_xdp_xmit = mtk_xdp_xmit, ++}; ++ ++static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) ++{ ++ const __be32 *_id = of_get_property(np, "reg", NULL); ++ phy_interface_t phy_mode; ++ struct phylink *phylink; ++ struct mtk_mac *mac; ++ int id, err; ++ ++ if (!_id) { ++ dev_err(eth->dev, "missing mac id\n"); ++ return -EINVAL; ++ } ++ ++ id = be32_to_cpup(_id); ++ if (id >= MTK_MAC_COUNT) { ++ dev_err(eth->dev, "%d is not a valid mac id\n", id); ++ return -EINVAL; ++ } ++ ++ if (eth->netdev[id]) { ++ dev_err(eth->dev, "duplicate mac id found: %d\n", id); ++ return -EINVAL; ++ } ++ ++ eth->netdev[id] = alloc_etherdev(sizeof(*mac)); ++ if (!eth->netdev[id]) { ++ dev_err(eth->dev, "alloc_etherdev failed\n"); ++ return -ENOMEM; ++ } ++ mac = netdev_priv(eth->netdev[id]); ++ eth->mac[id] = mac; ++ mac->id = id; ++ mac->hw = eth; ++ mac->of_node = np; ++ ++ memset(mac->hwlro_ip, 0, sizeof(mac->hwlro_ip)); ++ mac->hwlro_ip_cnt = 0; ++ ++ mac->hw_stats = devm_kzalloc(eth->dev, ++ sizeof(*mac->hw_stats), ++ GFP_KERNEL); ++ if (!mac->hw_stats) { ++ dev_err(eth->dev, "failed to allocate counter memory\n"); ++ err = -ENOMEM; ++ goto free_netdev; ++ } ++ spin_lock_init(&mac->hw_stats->stats_lock); ++ u64_stats_init(&mac->hw_stats->syncp); ++ mac->hw_stats->reg_offset = id * MTK_STAT_OFFSET; ++ ++ /* phylink create */ ++ err = of_get_phy_mode(np, &phy_mode); ++ if (err) { ++ dev_err(eth->dev, "incorrect phy-mode\n"); ++ goto free_netdev; ++ } ++ ++ /* mac config is not set */ ++ mac->interface = PHY_INTERFACE_MODE_NA; ++ mac->speed = SPEED_UNKNOWN; ++ ++ mac->phylink_config.dev = ð->netdev[id]->dev; ++ mac->phylink_config.type = PHYLINK_NETDEV; ++ /* This driver makes use of state->speed in mac_config */ ++ mac->phylink_config.legacy_pre_march2020 = true; ++ 
mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | ++ MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; ++ ++ __set_bit(PHY_INTERFACE_MODE_MII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_GMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_RGMII)) ++ phy_interface_set_rgmii(mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_TRGMII) && !mac->id) ++ __set_bit(PHY_INTERFACE_MODE_TRGMII, ++ mac->phylink_config.supported_interfaces); ++ ++ if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_SGMII)) { ++ __set_bit(PHY_INTERFACE_MODE_SGMII, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_1000BASEX, ++ mac->phylink_config.supported_interfaces); ++ __set_bit(PHY_INTERFACE_MODE_2500BASEX, ++ mac->phylink_config.supported_interfaces); ++ } ++ ++ phylink = phylink_create(&mac->phylink_config, ++ of_fwnode_handle(mac->of_node), ++ phy_mode, &mtk_phylink_ops); ++ if (IS_ERR(phylink)) { ++ err = PTR_ERR(phylink); ++ goto free_netdev; ++ } ++ ++ mac->phylink = phylink; ++ ++ SET_NETDEV_DEV(eth->netdev[id], eth->dev); ++ eth->netdev[id]->watchdog_timeo = 5 * HZ; ++ eth->netdev[id]->netdev_ops = &mtk_netdev_ops; ++ eth->netdev[id]->base_addr = (unsigned long)eth->base; ++ ++ eth->netdev[id]->hw_features = eth->soc->hw_features; ++ if (eth->hwlro) ++ eth->netdev[id]->hw_features |= NETIF_F_LRO; ++ ++ eth->netdev[id]->vlan_features = eth->soc->hw_features & ++ ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX); ++ eth->netdev[id]->features |= eth->soc->hw_features; ++ eth->netdev[id]->ethtool_ops = &mtk_ethtool_ops; ++ ++ eth->netdev[id]->irq = eth->irq[0]; ++ eth->netdev[id]->dev.of_node = np; ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN; ++ else ++ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN; ++ ++ return 0; ++ ++free_netdev: ++ free_netdev(eth->netdev[id]); ++ return err; ++} ++ ++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(dev_list); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ dev = eth->netdev[i]; ++ ++ if (!dev || !(dev->flags & IFF_UP)) ++ continue; ++ ++ list_add_tail(&dev->close_list, &dev_list); ++ } ++ ++ dev_close_many(&dev_list, false); ++ ++ eth->dma_dev = dma_dev; ++ ++ list_for_each_entry_safe(dev, tmp, &dev_list, close_list) { ++ list_del_init(&dev->close_list); ++ dev_open(dev, NULL); ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int mtk_probe(struct platform_device *pdev) ++{ ++ struct device_node *mac_np; ++ struct mtk_eth *eth; ++ int err, i; ++ ++ eth = devm_kzalloc(&pdev->dev, sizeof(*eth), GFP_KERNEL); ++ if (!eth) ++ return -ENOMEM; ++ ++ eth->soc = of_device_get_match_data(&pdev->dev); ++ ++ eth->dev = &pdev->dev; ++ eth->dma_dev = &pdev->dev; ++ eth->base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(eth->base)) ++ return PTR_ERR(eth->base); ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) ++ eth->ip_align = NET_IP_ALIGN; ++ ++ spin_lock_init(ð->page_lock); ++ spin_lock_init(ð->tx_irq_lock); ++ spin_lock_init(ð->rx_irq_lock); ++ spin_lock_init(ð->dim_lock); ++ ++ eth->rx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->rx_dim.work, mtk_dim_rx); ++ ++ eth->tx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; ++ INIT_WORK(ð->tx_dim.work, mtk_dim_tx); ++ ++ if (!MTK_HAS_CAPS(eth->soc->caps, 
MTK_SOC_MT7628)) { ++ eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,ethsys"); ++ if (IS_ERR(eth->ethsys)) { ++ dev_err(&pdev->dev, "no ethsys regmap found\n"); ++ return PTR_ERR(eth->ethsys); ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_INFRA)) { ++ eth->infra = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,infracfg"); ++ if (IS_ERR(eth->infra)) { ++ dev_err(&pdev->dev, "no infracfg regmap found\n"); ++ return PTR_ERR(eth->infra); ++ } ++ } ++ ++ if (of_dma_is_coherent(pdev->dev.of_node)) { ++ struct regmap *cci; ++ ++ cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "cci-control-port"); ++ /* enable CPU/bus coherency */ ++ if (!IS_ERR(cci)) ++ regmap_write(cci, 0, 3); ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) { ++ eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii), ++ GFP_KERNEL); ++ if (!eth->sgmii) ++ return -ENOMEM; ++ ++ err = mtk_sgmii_init(eth->sgmii, pdev->dev.of_node, ++ eth->soc->ana_rgc3); ++ ++ if (err) ++ return err; ++ } ++ ++ if (eth->soc->required_pctl) { ++ eth->pctl = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, ++ "mediatek,pctl"); ++ if (IS_ERR(eth->pctl)) { ++ dev_err(&pdev->dev, "no pctl regmap found\n"); ++ return PTR_ERR(eth->pctl); ++ } ++ } ++ ++ for (i = 0;; i++) { ++ struct device_node *np = of_parse_phandle(pdev->dev.of_node, ++ "mediatek,wed", i); ++ static const u32 wdma_regs[] = { ++ MTK_WDMA0_BASE, ++ MTK_WDMA1_BASE ++ }; ++ void __iomem *wdma; ++ ++ if (!np || i >= ARRAY_SIZE(wdma_regs)) ++ break; ++ ++ wdma = eth->base + wdma_regs[i]; ++ mtk_wed_add_hw(np, eth, wdma, i); ++ } ++ ++ for (i = 0; i < 3; i++) { ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT) && i > 0) ++ eth->irq[i] = eth->irq[0]; ++ else ++ eth->irq[i] = platform_get_irq(pdev, i); ++ if (eth->irq[i] < 0) { ++ dev_err(&pdev->dev, "no IRQ%d resource found\n", i); ++ err = -ENXIO; ++ goto err_wed_exit; ++ } ++ } ++ for (i = 0; i < ARRAY_SIZE(eth->clks); i++) { ++ eth->clks[i] = devm_clk_get(eth->dev, ++ mtk_clks_source_name[i]); ++ if (IS_ERR(eth->clks[i])) { ++ if (PTR_ERR(eth->clks[i]) == -EPROBE_DEFER) { ++ err = -EPROBE_DEFER; ++ goto err_wed_exit; ++ } ++ if (eth->soc->required_clks & BIT(i)) { ++ dev_err(&pdev->dev, "clock %s not found\n", ++ mtk_clks_source_name[i]); ++ err = -EINVAL; ++ goto err_wed_exit; ++ } ++ eth->clks[i] = NULL; ++ } ++ } ++ ++ eth->msg_enable = netif_msg_init(mtk_msg_level, MTK_DEFAULT_MSG_ENABLE); ++ INIT_WORK(ð->pending_work, mtk_pending_work); ++ ++ err = mtk_hw_init(eth); ++ if (err) ++ goto err_wed_exit; ++ ++ eth->hwlro = MTK_HAS_CAPS(eth->soc->caps, MTK_HWLRO); ++ ++ for_each_child_of_node(pdev->dev.of_node, mac_np) { ++ if (!of_device_is_compatible(mac_np, ++ "mediatek,eth-mac")) ++ continue; ++ ++ if (!of_device_is_available(mac_np)) ++ continue; ++ ++ err = mtk_add_mac(eth, mac_np); ++ if (err) { ++ of_node_put(mac_np); ++ goto err_deinit_hw; ++ } ++ } ++ ++ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT)) { ++ err = devm_request_irq(eth->dev, eth->irq[0], ++ mtk_handle_irq, 0, ++ dev_name(eth->dev), eth); ++ } else { ++ err = devm_request_irq(eth->dev, eth->irq[1], ++ mtk_handle_irq_tx, 0, ++ dev_name(eth->dev), eth); ++ if (err) ++ goto err_free_dev; ++ ++ err = devm_request_irq(eth->dev, eth->irq[2], ++ mtk_handle_irq_rx, 0, ++ dev_name(eth->dev), eth); ++ } ++ if (err) ++ goto err_free_dev; ++ ++ /* No MT7628/88 support yet */ ++ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) { ++ err = mtk_mdio_init(eth); ++ if (err) ++ goto err_free_dev; ++ } ++ 
++ if (eth->soc->offload_version) { ++ eth->ppe = mtk_ppe_init(eth, eth->base + MTK_ETH_PPE_BASE, 2); ++ if (!eth->ppe) { ++ err = -ENOMEM; ++ goto err_deinit_mdio; ++ } ++ ++ err = mtk_eth_offload_init(eth); ++ if (err) ++ goto err_deinit_mdio; ++ } ++ ++ for (i = 0; i < MTK_MAX_DEVS; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ ++ err = register_netdev(eth->netdev[i]); ++ if (err) { ++ dev_err(eth->dev, "error bringing up device\n"); ++ goto err_deinit_mdio; ++ } else ++ netif_info(eth, probe, eth->netdev[i], ++ "mediatek frame engine at 0x%08lx, irq %d\n", ++ eth->netdev[i]->base_addr, eth->irq[0]); ++ } ++ ++ /* we run 2 devices on the same DMA ring so we need a dummy device ++ * for NAPI to work ++ */ ++ init_dummy_netdev(ð->dummy_dev); ++ netif_napi_add(ð->dummy_dev, ð->tx_napi, mtk_napi_tx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add(ð->dummy_dev, ð->rx_napi, mtk_napi_rx, ++ NAPI_POLL_WEIGHT); ++ ++ platform_set_drvdata(pdev, eth); ++ ++ return 0; ++ ++err_deinit_mdio: ++ mtk_mdio_cleanup(eth); ++err_free_dev: ++ mtk_free_dev(eth); ++err_deinit_hw: ++ mtk_hw_deinit(eth); ++err_wed_exit: ++ mtk_wed_exit(); ++ ++ return err; ++} ++ ++static int mtk_remove(struct platform_device *pdev) ++{ ++ struct mtk_eth *eth = platform_get_drvdata(pdev); ++ struct mtk_mac *mac; ++ int i; ++ ++ /* stop all devices to make sure that dma is properly shut down */ ++ for (i = 0; i < MTK_MAC_COUNT; i++) { ++ if (!eth->netdev[i]) ++ continue; ++ mtk_stop(eth->netdev[i]); ++ mac = netdev_priv(eth->netdev[i]); ++ phylink_disconnect_phy(mac->phylink); ++ } ++ ++ mtk_wed_exit(); ++ mtk_hw_deinit(eth); ++ ++ netif_napi_del(ð->tx_napi); ++ netif_napi_del(ð->rx_napi); ++ mtk_cleanup(eth); ++ mtk_mdio_cleanup(eth); ++ ++ return 0; ++} ++ ++static const struct mtk_soc_data mt2701_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7621_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7621_CAPS, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7621_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7622_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x2028, ++ .caps = MT7622_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7622_CLKS_BITMAP, ++ .required_pctl = false, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7623_data = { ++ .reg_map = &mtk_reg_map, ++ .caps = MT7623_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7623_CLKS_BITMAP, ++ .required_pctl = true, ++ .offload_version = 2, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ 
.rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7629_data = { ++ .reg_map = &mtk_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7629_CAPS | MTK_HWLRO, ++ .hw_features = MTK_HW_FEATURES, ++ .required_clks = MT7629_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++static const struct mtk_soc_data mt7986_data = { ++ .reg_map = &mt7986_reg_map, ++ .ana_rgc3 = 0x128, ++ .caps = MT7986_CAPS, ++ .required_clks = MT7986_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma_v2), ++ .rxd_size = sizeof(struct mtk_rx_dma_v2), ++ .rx_irq_done_mask = MTK_RX_DONE_INT_V2, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_V2, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN_V2, ++ .dma_len_offset = 8, ++ }, ++}; ++ ++static const struct mtk_soc_data rt5350_data = { ++ .reg_map = &mt7628_reg_map, ++ .caps = MT7628_CAPS, ++ .hw_features = MTK_HW_FEATURES_MT7628, ++ .required_clks = MT7628_CLKS_BITMAP, ++ .required_pctl = false, ++ .txrx = { ++ .txd_size = sizeof(struct mtk_tx_dma), ++ .rxd_size = sizeof(struct mtk_rx_dma), ++ .rx_irq_done_mask = MTK_RX_DONE_INT, ++ .rx_dma_l4_valid = RX_DMA_L4_VALID_PDMA, ++ .dma_max_len = MTK_TX_DMA_BUF_LEN, ++ .dma_len_offset = 16, ++ }, ++}; ++ ++const struct of_device_id of_mtk_match[] = { ++ { .compatible = "mediatek,mt2701-eth", .data = &mt2701_data}, ++ { .compatible = "mediatek,mt7621-eth", .data = &mt7621_data}, ++ { .compatible = "mediatek,mt7622-eth", .data = &mt7622_data}, ++ { .compatible = "mediatek,mt7623-eth", .data = &mt7623_data}, ++ { .compatible = "mediatek,mt7629-eth", .data = &mt7629_data}, ++ { .compatible = "mediatek,mt7986-eth", .data = &mt7986_data}, ++ { .compatible = "ralink,rt5350-eth", .data = &rt5350_data}, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, of_mtk_match); ++ ++static struct platform_driver mtk_driver = { ++ .probe = mtk_probe, ++ .remove = mtk_remove, ++ .driver = { ++ .name = "mtk_soc_eth", ++ .of_match_table = of_mtk_match, ++ }, ++}; ++ ++module_platform_driver(mtk_driver); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("John Crispin "); ++MODULE_DESCRIPTION("Ethernet driver for MediaTek SoC"); +diff -rupN linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +--- linux.orig/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 2022-12-04 10:40:26.692034106 -0500 +@@ -827,12 +827,12 @@ mlxsw_sp_port_get_sw_stats64(const struc for_each_possible_cpu(i) { p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i); do { @@ -2234,11 +12394,10 @@ index 30c7b0e157218..fa2753318cdf7 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c -index 9259a74eca40b..318dbbb482797 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_en.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c -@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c linux/drivers/net/ethernet/microsoft/mana/mana_en.c +--- 
linux.orig/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_en.c 2022-12-04 10:40:26.692034106 -0500 +@@ -315,10 +315,10 @@ static void mana_get_stats64(struct net_ rx_stats = &apc->rxqs[q]->stats; do { @@ -2251,7 +12410,7 @@ index 9259a74eca40b..318dbbb482797 100644 st->rx_packets += packets; st->rx_bytes += bytes; -@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_device *ndev, +@@ -328,10 +328,10 @@ static void mana_get_stats64(struct net_ tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2264,11 +12423,10 @@ index 9259a74eca40b..318dbbb482797 100644 st->tx_packets += packets; st->tx_bytes += bytes; -diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -index c530db76880f0..96d55c91c9698 100644 ---- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c -@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +--- linux.orig/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 2022-12-04 10:40:26.692034106 -0500 +@@ -90,13 +90,13 @@ static void mana_get_ethtool_stats(struc rx_stats = &apc->rxqs[q]->stats; do { @@ -2284,7 +12442,7 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struct net_device *ndev, +@@ -109,11 +109,11 @@ static void mana_get_ethtool_stats(struc tx_stats = &apc->tx_qp[q].txq.stats; do { @@ -2298,11 +12456,10 @@ index c530db76880f0..96d55c91c9698 100644 data[i++] = packets; data[i++] = bytes; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -index 349a2b1a19a24..cf4d6f1129fa2 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c -@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_de unsigned int start; do { @@ -2328,11 +12485,10 @@ index 349a2b1a19a24..cf4d6f1129fa2 100644 stats->tx_packets += data[0]; stats->tx_bytes += data[1]; stats->tx_errors += data[2]; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -index b1b1b648e40cb..eeb1455a4e5db 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c -@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 2022-12-04 10:40:26.692034106 -0500 
+@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct unsigned int start; do { @@ -2341,7 +12497,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[0] = nn->r_vecs[i].rx_pkts; tmp[0] = nn->r_vecs[i].hw_csum_rx_ok; tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok; -@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[3] = nn->r_vecs[i].hw_csum_rx_error; tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail; tmp[5] = nn->r_vecs[i].hw_tls_rx; @@ -2354,7 +12510,7 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data[1] = nn->r_vecs[i].tx_pkts; data[2] = nn->r_vecs[i].tx_busy; tmp[6] = nn->r_vecs[i].hw_csum_tx; -@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) +@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct tmp[10] = nn->r_vecs[i].hw_tls_tx; tmp[11] = nn->r_vecs[i].tls_tx_fallback; tmp[12] = nn->r_vecs[i].tls_tx_no_fallback; @@ -2363,11 +12519,10 @@ index b1b1b648e40cb..eeb1455a4e5db 100644 data += NN_RVEC_PER_Q_STATS; -diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -index 8b77582bdfa01..a6b6ca1fd55ee 100644 ---- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c -@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct net_device *netdev, +diff -rupN linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +--- linux.orig/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 2022-12-04 10:40:26.692034106 -0500 +@@ -134,13 +134,13 @@ nfp_repr_get_host_stats64(const struct n repr_stats = per_cpu_ptr(repr->stats, i); do { @@ -2383,11 +12538,10 @@ index 8b77582bdfa01..a6b6ca1fd55ee 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; -diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c -index 5116badaf0919..50ebbd7e91c48 100644 ---- a/drivers/net/ethernet/nvidia/forcedeth.c -+++ b/drivers/net/ethernet/nvidia/forcedeth.c -@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +diff -rupN linux.orig/drivers/net/ethernet/nvidia/forcedeth.c linux/drivers/net/ethernet/nvidia/forcedeth.c +--- linux.orig/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/nvidia/forcedeth.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1734,12 +1734,12 @@ static void nv_get_stats(int cpu, struct u64 tx_packets, tx_bytes, tx_dropped; do { @@ -2402,7 +12556,7 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->rx_packets += rx_packets; storage->rx_bytes += rx_bytes; -@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct fe_priv *np, +@@ -1747,11 +1747,11 @@ static void nv_get_stats(int cpu, struct storage->rx_missed_errors += rx_missed_errors; do { @@ -2416,11 +12570,10 @@ index 5116badaf0919..50ebbd7e91c48 100644 storage->tx_packets += tx_packets; storage->tx_bytes += tx_bytes; -diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -index 1b2119b1d48aa..3f5e6572d20e7 100644 ---- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c -@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +--- linux.orig/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c 2022-12-04 10:40:26.692034106 -0500 +@@ -135,9 +135,9 @@ static void rmnet_get_stats64(struct net pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { @@ -2432,11 +12585,10 @@ index 1b2119b1d48aa..3f5e6572d20e7 100644 total_stats.rx_pkts += snapshot.rx_pkts; total_stats.rx_bytes += snapshot.rx_bytes; -diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c -index 15b40fd93cd2e..82bd0eb614634 100644 ---- a/drivers/net/ethernet/realtek/8139too.c -+++ b/drivers/net/ethernet/realtek/8139too.c -@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/realtek/8139too.c linux/drivers/net/ethernet/realtek/8139too.c +--- linux.orig/drivers/net/ethernet/realtek/8139too.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/realtek/8139too.c 2022-12-04 10:40:26.692034106 -0500 +@@ -2532,16 +2532,16 @@ rtl8139_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2457,11 +12609,10 @@ index 15b40fd93cd2e..82bd0eb614634 100644 } /* Set or clear the multicast filter for this adaptor. -diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c -index f0c8de2c60755..d4f7238333bb7 100644 ---- a/drivers/net/ethernet/socionext/sni_ave.c -+++ b/drivers/net/ethernet/socionext/sni_ave.c -@@ -1506,16 +1506,16 @@ static void ave_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c linux/drivers/net/ethernet/socionext/sni_ave.c +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c 2022-12-04 10:40:26.692034106 -0500 +@@ -1508,16 +1508,16 @@ static void ave_get_stats64(struct net_d unsigned int start; do { @@ -2482,11 +12633,2010 @@ index f0c8de2c60755..d4f7238333bb7 100644 stats->rx_errors = priv->stats_rx.errors; stats->tx_errors = priv->stats_tx.errors; -diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -index f4a6b590a1e39..1b62400c19049 100644 ---- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c -+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c -@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig linux/drivers/net/ethernet/socionext/sni_ave.c.orig +--- linux.orig/drivers/net/ethernet/socionext/sni_ave.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ethernet/socionext/sni_ave.c.orig 2022-12-04 10:40:18.168055947 -0500 +@@ -0,0 +1,1996 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * sni_ave.c - Socionext UniPhier AVE ethernet driver ++ * Copyright 2014 Panasonic Corporation ++ * Copyright 2015-2017 Socionext Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* General Register Group */ ++#define AVE_IDR 0x000 /* ID */ ++#define AVE_VR 0x004 /* Version */ ++#define AVE_GRR 0x008 /* Global Reset */ ++#define AVE_CFGR 0x00c /* Configuration */ ++ ++/* Interrupt Register Group */ ++#define AVE_GIMR 0x100 /* Global Interrupt Mask */ ++#define AVE_GISR 0x104 /* Global Interrupt Status */ ++ ++/* MAC Register Group */ ++#define AVE_TXCR 0x200 /* TX Setup */ ++#define AVE_RXCR 0x204 /* RX Setup */ ++#define AVE_RXMAC1R 0x208 /* MAC address (lower) */ ++#define AVE_RXMAC2R 0x20c /* MAC address (upper) */ ++#define AVE_MDIOCTR 0x214 /* MDIO Control */ ++#define AVE_MDIOAR 0x218 /* MDIO Address */ ++#define AVE_MDIOWDR 0x21c /* MDIO Data */ ++#define AVE_MDIOSR 0x220 /* MDIO Status */ ++#define AVE_MDIORDR 0x224 /* MDIO Rd Data */ ++ ++/* Descriptor Control Register Group */ ++#define AVE_DESCC 0x300 /* Descriptor Control */ ++#define AVE_TXDC 0x304 /* TX Descriptor Configuration */ ++#define AVE_RXDC0 0x308 /* RX Descriptor Ring0 Configuration */ ++#define AVE_IIRQC 0x34c /* Interval IRQ Control */ ++ ++/* Packet Filter Register Group */ ++#define AVE_PKTF_BASE 0x800 /* PF Base Address */ ++#define AVE_PFMBYTE_BASE 0xd00 /* PF Mask Byte Base Address */ ++#define AVE_PFMBIT_BASE 0xe00 /* PF Mask Bit Base Address */ ++#define AVE_PFSEL_BASE 0xf00 /* PF Selector Base Address */ ++#define AVE_PFEN 0xffc /* Packet Filter Enable */ ++#define AVE_PKTF(ent) (AVE_PKTF_BASE + (ent) * 0x40) ++#define AVE_PFMBYTE(ent) (AVE_PFMBYTE_BASE + (ent) * 8) ++#define AVE_PFMBIT(ent) (AVE_PFMBIT_BASE + (ent) * 4) ++#define AVE_PFSEL(ent) (AVE_PFSEL_BASE + (ent) * 4) ++ ++/* 64bit descriptor memory */ ++#define AVE_DESC_SIZE_64 12 /* Descriptor Size */ ++ ++#define AVE_TXDM_64 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_64 0x1c00 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_64 0x0ba0 /* Tx Descriptor Memory Size 3KB */ ++#define AVE_RXDM_SIZE_64 0x6000 /* Rx Descriptor Memory Size 24KB */ ++ ++/* 32bit descriptor memory */ ++#define AVE_DESC_SIZE_32 8 /* Descriptor Size */ ++ ++#define AVE_TXDM_32 0x1000 /* Tx Descriptor Memory */ ++#define AVE_RXDM_32 0x1800 /* Rx Descriptor Memory */ ++ ++#define AVE_TXDM_SIZE_32 0x07c0 /* Tx Descriptor Memory Size 2KB */ ++#define AVE_RXDM_SIZE_32 0x4000 /* Rx Descriptor Memory Size 16KB */ ++ ++/* RMII Bridge Register Group */ ++#define AVE_RSTCTRL 0x8028 /* Reset control */ ++#define AVE_RSTCTRL_RMIIRST BIT(16) ++#define AVE_LINKSEL 0x8034 /* Link speed setting */ ++#define AVE_LINKSEL_100M BIT(0) ++ ++/* AVE_GRR */ ++#define AVE_GRR_RXFFR BIT(5) /* Reset RxFIFO */ ++#define AVE_GRR_PHYRST BIT(4) /* Reset external PHY */ ++#define AVE_GRR_GRST BIT(0) /* Reset all MAC */ ++ ++/* AVE_CFGR */ ++#define AVE_CFGR_FLE BIT(31) /* Filter Function */ ++#define AVE_CFGR_CHE BIT(30) /* Checksum Function */ ++#define AVE_CFGR_MII BIT(27) /* Func mode (1:MII/RMII, 0:RGMII) */ ++#define AVE_CFGR_IPFCEN BIT(24) /* IP fragment sum Enable */ ++ ++/* AVE_GISR (common with GIMR) */ ++#define AVE_GI_PHY BIT(24) /* PHY interrupt */ ++#define AVE_GI_TX BIT(16) /* Tx complete */ ++#define AVE_GI_RXERR BIT(8) /* Receive frame more than max size */ ++#define AVE_GI_RXOVF BIT(7) /* Overflow at the RxFIFO */ ++#define AVE_GI_RXDROP BIT(6) /* Drop packet */ ++#define AVE_GI_RXIINT BIT(5) /* Interval interrupt */ ++ ++/* AVE_TXCR */ 
++#define AVE_TXCR_FLOCTR BIT(18) /* Flow control */ ++#define AVE_TXCR_TXSPD_1G BIT(17) ++#define AVE_TXCR_TXSPD_100 BIT(16) ++ ++/* AVE_RXCR */ ++#define AVE_RXCR_RXEN BIT(30) /* Rx enable */ ++#define AVE_RXCR_FDUPEN BIT(22) /* Interface mode */ ++#define AVE_RXCR_FLOCTR BIT(21) /* Flow control */ ++#define AVE_RXCR_AFEN BIT(19) /* MAC address filter */ ++#define AVE_RXCR_DRPEN BIT(18) /* Drop pause frame */ ++#define AVE_RXCR_MPSIZ_MASK GENMASK(10, 0) ++ ++/* AVE_MDIOCTR */ ++#define AVE_MDIOCTR_RREQ BIT(3) /* Read request */ ++#define AVE_MDIOCTR_WREQ BIT(2) /* Write request */ ++ ++/* AVE_MDIOSR */ ++#define AVE_MDIOSR_STS BIT(0) /* access status */ ++ ++/* AVE_DESCC */ ++#define AVE_DESCC_STATUS_MASK GENMASK(31, 16) ++#define AVE_DESCC_RD0 BIT(8) /* Enable Rx descriptor Ring0 */ ++#define AVE_DESCC_RDSTP BIT(4) /* Pause Rx descriptor */ ++#define AVE_DESCC_TD BIT(0) /* Enable Tx descriptor */ ++ ++/* AVE_TXDC */ ++#define AVE_TXDC_SIZE GENMASK(27, 16) /* Size of Tx descriptor */ ++#define AVE_TXDC_ADDR GENMASK(11, 0) /* Start address */ ++#define AVE_TXDC_ADDR_START 0 ++ ++/* AVE_RXDC0 */ ++#define AVE_RXDC0_SIZE GENMASK(30, 16) /* Size of Rx descriptor */ ++#define AVE_RXDC0_ADDR GENMASK(14, 0) /* Start address */ ++#define AVE_RXDC0_ADDR_START 0 ++ ++/* AVE_IIRQC */ ++#define AVE_IIRQC_EN0 BIT(27) /* Enable interval interrupt Ring0 */ ++#define AVE_IIRQC_BSCK GENMASK(15, 0) /* Interval count unit */ ++ ++/* Command status for descriptor */ ++#define AVE_STS_OWN BIT(31) /* Descriptor ownership */ ++#define AVE_STS_INTR BIT(29) /* Request for interrupt */ ++#define AVE_STS_OK BIT(27) /* Normal transmit */ ++/* TX */ ++#define AVE_STS_NOCSUM BIT(28) /* No use HW checksum */ ++#define AVE_STS_1ST BIT(26) /* Head of buffer chain */ ++#define AVE_STS_LAST BIT(25) /* Tail of buffer chain */ ++#define AVE_STS_OWC BIT(21) /* Out of window,Late Collision */ ++#define AVE_STS_EC BIT(20) /* Excess collision occurred */ ++#define AVE_STS_PKTLEN_TX_MASK GENMASK(15, 0) ++/* RX */ ++#define AVE_STS_CSSV BIT(21) /* Checksum check performed */ ++#define AVE_STS_CSER BIT(20) /* Checksum error detected */ ++#define AVE_STS_PKTLEN_RX_MASK GENMASK(10, 0) ++ ++/* Packet filter */ ++#define AVE_PFMBYTE_MASK0 (GENMASK(31, 8) | GENMASK(5, 0)) ++#define AVE_PFMBYTE_MASK1 GENMASK(25, 0) ++#define AVE_PFMBIT_MASK GENMASK(15, 0) ++ ++#define AVE_PF_SIZE 17 /* Number of all packet filter */ ++#define AVE_PF_MULTICAST_SIZE 7 /* Number of multicast filter */ ++ ++#define AVE_PFNUM_FILTER 0 /* No.0 */ ++#define AVE_PFNUM_UNICAST 1 /* No.1 */ ++#define AVE_PFNUM_BROADCAST 2 /* No.2 */ ++#define AVE_PFNUM_MULTICAST 11 /* No.11-17 */ ++ ++/* NETIF Message control */ ++#define AVE_DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | \ ++ NETIF_MSG_PROBE | \ ++ NETIF_MSG_LINK | \ ++ NETIF_MSG_TIMER | \ ++ NETIF_MSG_IFDOWN | \ ++ NETIF_MSG_IFUP | \ ++ NETIF_MSG_RX_ERR | \ ++ NETIF_MSG_TX_ERR) ++ ++/* Parameter for descriptor */ ++#define AVE_NR_TXDESC 64 /* Tx descriptor */ ++#define AVE_NR_RXDESC 256 /* Rx descriptor */ ++ ++#define AVE_DESC_OFS_CMDSTS 0 ++#define AVE_DESC_OFS_ADDRL 4 ++#define AVE_DESC_OFS_ADDRU 8 ++ ++/* Parameter for ethernet frame */ ++#define AVE_MAX_ETHFRAME 1518 ++#define AVE_FRAME_HEADROOM 2 ++ ++/* Parameter for interrupt */ ++#define AVE_INTM_COUNT 20 ++#define AVE_FORCE_TXINTCNT 1 ++ ++/* SG */ ++#define SG_ETPINMODE 0x540 ++#define SG_ETPINMODE_EXTPHY BIT(1) /* for LD11 */ ++#define SG_ETPINMODE_RMII(ins) BIT(ins) ++ ++#define IS_DESC_64BIT(p) ((p)->data->is_desc_64bit) ++ ++#define AVE_MAX_CLKS 4 ++#define 
AVE_MAX_RSTS 2 ++ ++enum desc_id { ++ AVE_DESCID_RX, ++ AVE_DESCID_TX, ++}; ++ ++enum desc_state { ++ AVE_DESC_RX_PERMIT, ++ AVE_DESC_RX_SUSPEND, ++ AVE_DESC_START, ++ AVE_DESC_STOP, ++}; ++ ++struct ave_desc { ++ struct sk_buff *skbs; ++ dma_addr_t skbs_dma; ++ size_t skbs_dmalen; ++}; ++ ++struct ave_desc_info { ++ u32 ndesc; /* number of descriptor */ ++ u32 daddr; /* start address of descriptor */ ++ u32 proc_idx; /* index of processing packet */ ++ u32 done_idx; /* index of processed packet */ ++ struct ave_desc *desc; /* skb info related descriptor */ ++}; ++ ++struct ave_stats { ++ struct u64_stats_sync syncp; ++ u64 packets; ++ u64 bytes; ++ u64 errors; ++ u64 dropped; ++ u64 collisions; ++ u64 fifo_errors; ++}; ++ ++struct ave_private { ++ void __iomem *base; ++ int irq; ++ int phy_id; ++ unsigned int desc_size; ++ u32 msg_enable; ++ int nclks; ++ struct clk *clk[AVE_MAX_CLKS]; ++ int nrsts; ++ struct reset_control *rst[AVE_MAX_RSTS]; ++ phy_interface_t phy_mode; ++ struct phy_device *phydev; ++ struct mii_bus *mdio; ++ struct regmap *regmap; ++ unsigned int pinmode_mask; ++ unsigned int pinmode_val; ++ u32 wolopts; ++ ++ /* stats */ ++ struct ave_stats stats_rx; ++ struct ave_stats stats_tx; ++ ++ /* NAPI support */ ++ struct net_device *ndev; ++ struct napi_struct napi_rx; ++ struct napi_struct napi_tx; ++ ++ /* descriptor */ ++ struct ave_desc_info rx; ++ struct ave_desc_info tx; ++ ++ /* flow control */ ++ int pause_auto; ++ int pause_rx; ++ int pause_tx; ++ ++ const struct ave_soc_data *data; ++}; ++ ++struct ave_soc_data { ++ bool is_desc_64bit; ++ const char *clock_names[AVE_MAX_CLKS]; ++ const char *reset_names[AVE_MAX_RSTS]; ++ int (*get_pinmode)(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg); ++}; ++ ++static u32 ave_desc_read(struct net_device *ndev, enum desc_id id, int entry, ++ int offset) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ return readl(priv->base + addr); ++} ++ ++static u32 ave_desc_read_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry) ++{ ++ return ave_desc_read(ndev, id, entry, AVE_DESC_OFS_CMDSTS); ++} ++ ++static void ave_desc_write(struct net_device *ndev, enum desc_id id, ++ int entry, int offset, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 addr; ++ ++ addr = ((id == AVE_DESCID_TX) ? 
priv->tx.daddr : priv->rx.daddr) ++ + entry * priv->desc_size + offset; ++ ++ writel(val, priv->base + addr); ++} ++ ++static void ave_desc_write_cmdsts(struct net_device *ndev, enum desc_id id, ++ int entry, u32 val) ++{ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_CMDSTS, val); ++} ++ ++static void ave_desc_write_addr(struct net_device *ndev, enum desc_id id, ++ int entry, dma_addr_t paddr) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ ave_desc_write(ndev, id, entry, AVE_DESC_OFS_ADDRL, ++ lower_32_bits(paddr)); ++ if (IS_DESC_64BIT(priv)) ++ ave_desc_write(ndev, id, ++ entry, AVE_DESC_OFS_ADDRU, ++ upper_32_bits(paddr)); ++} ++ ++static u32 ave_irq_disable_all(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 ret; ++ ++ ret = readl(priv->base + AVE_GIMR); ++ writel(0, priv->base + AVE_GIMR); ++ ++ return ret; ++} ++ ++static void ave_irq_restore(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(val, priv->base + AVE_GIMR); ++} ++ ++static void ave_irq_enable(struct net_device *ndev, u32 bitflag) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(readl(priv->base + AVE_GIMR) | bitflag, priv->base + AVE_GIMR); ++ writel(bitflag, priv->base + AVE_GISR); ++} ++ ++static void ave_hw_write_macaddr(struct net_device *ndev, ++ const unsigned char *mac_addr, ++ int reg1, int reg2) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ writel(mac_addr[0] | mac_addr[1] << 8 | ++ mac_addr[2] << 16 | mac_addr[3] << 24, priv->base + reg1); ++ writel(mac_addr[4] | mac_addr[5] << 8, priv->base + reg2); ++} ++ ++static void ave_hw_read_version(struct net_device *ndev, char *buf, int len) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 major, minor, vr; ++ ++ vr = readl(priv->base + AVE_VR); ++ major = (vr & GENMASK(15, 8)) >> 8; ++ minor = (vr & GENMASK(7, 0)); ++ snprintf(buf, len, "v%u.%u", major, minor); ++} ++ ++static void ave_ethtool_get_drvinfo(struct net_device *ndev, ++ struct ethtool_drvinfo *info) ++{ ++ struct device *dev = ndev->dev.parent; ++ ++ strlcpy(info->driver, dev->driver->name, sizeof(info->driver)); ++ strlcpy(info->bus_info, dev_name(dev), sizeof(info->bus_info)); ++ ave_hw_read_version(ndev, info->fw_version, sizeof(info->fw_version)); ++} ++ ++static u32 ave_ethtool_get_msglevel(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ return priv->msg_enable; ++} ++ ++static void ave_ethtool_set_msglevel(struct net_device *ndev, u32 val) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ priv->msg_enable = val; ++} ++ ++static void ave_ethtool_get_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ wol->supported = 0; ++ wol->wolopts = 0; ++ ++ if (ndev->phydev) ++ phy_ethtool_get_wol(ndev->phydev, wol); ++} ++ ++static int __ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ if (!ndev->phydev || ++ (wol->wolopts & (WAKE_ARP | WAKE_MAGICSECURE))) ++ return -EOPNOTSUPP; ++ ++ return phy_ethtool_set_wol(ndev->phydev, wol); ++} ++ ++static int ave_ethtool_set_wol(struct net_device *ndev, ++ struct ethtool_wolinfo *wol) ++{ ++ int ret; ++ ++ ret = __ave_ethtool_set_wol(ndev, wol); ++ if (!ret) ++ device_set_wakeup_enable(&ndev->dev, !!wol->wolopts); ++ ++ return ret; ++} ++ ++static void ave_ethtool_get_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ pause->autoneg = priv->pause_auto; ++ 
pause->rx_pause = priv->pause_rx; ++ pause->tx_pause = priv->pause_tx; ++} ++ ++static int ave_ethtool_set_pauseparam(struct net_device *ndev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ ++ if (!phydev) ++ return -EINVAL; ++ ++ priv->pause_auto = pause->autoneg; ++ priv->pause_rx = pause->rx_pause; ++ priv->pause_tx = pause->tx_pause; ++ ++ phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause); ++ ++ return 0; ++} ++ ++static const struct ethtool_ops ave_ethtool_ops = { ++ .get_link_ksettings = phy_ethtool_get_link_ksettings, ++ .set_link_ksettings = phy_ethtool_set_link_ksettings, ++ .get_drvinfo = ave_ethtool_get_drvinfo, ++ .nway_reset = phy_ethtool_nway_reset, ++ .get_link = ethtool_op_get_link, ++ .get_msglevel = ave_ethtool_get_msglevel, ++ .set_msglevel = ave_ethtool_set_msglevel, ++ .get_wol = ave_ethtool_get_wol, ++ .set_wol = ave_ethtool_set_wol, ++ .get_pauseparam = ave_ethtool_get_pauseparam, ++ .set_pauseparam = ave_ethtool_set_pauseparam, ++}; ++ ++static int ave_mdiobus_read(struct mii_bus *bus, int phyid, int regnum) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* read request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_RREQ) & ~AVE_MDIOCTR_WREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) { ++ netdev_err(ndev, "failed to read (phy:%d reg:%x)\n", ++ phyid, regnum); ++ return ret; ++ } ++ ++ return readl(priv->base + AVE_MDIORDR) & GENMASK(15, 0); ++} ++ ++static int ave_mdiobus_write(struct mii_bus *bus, int phyid, int regnum, ++ u16 val) ++{ ++ struct net_device *ndev = bus->priv; ++ struct ave_private *priv; ++ u32 mdioctl, mdiosr; ++ int ret; ++ ++ priv = netdev_priv(ndev); ++ ++ /* write address */ ++ writel((phyid << 8) | regnum, priv->base + AVE_MDIOAR); ++ ++ /* write data */ ++ writel(val, priv->base + AVE_MDIOWDR); ++ ++ /* write request */ ++ mdioctl = readl(priv->base + AVE_MDIOCTR); ++ writel((mdioctl | AVE_MDIOCTR_WREQ) & ~AVE_MDIOCTR_RREQ, ++ priv->base + AVE_MDIOCTR); ++ ++ ret = readl_poll_timeout(priv->base + AVE_MDIOSR, mdiosr, ++ !(mdiosr & AVE_MDIOSR_STS), 20, 2000); ++ if (ret) ++ netdev_err(ndev, "failed to write (phy:%d reg:%x)\n", ++ phyid, regnum); ++ ++ return ret; ++} ++ ++static int ave_dma_map(struct net_device *ndev, struct ave_desc *desc, ++ void *ptr, size_t len, enum dma_data_direction dir, ++ dma_addr_t *paddr) ++{ ++ dma_addr_t map_addr; ++ ++ map_addr = dma_map_single(ndev->dev.parent, ptr, len, dir); ++ if (unlikely(dma_mapping_error(ndev->dev.parent, map_addr))) ++ return -ENOMEM; ++ ++ desc->skbs_dma = map_addr; ++ desc->skbs_dmalen = len; ++ *paddr = map_addr; ++ ++ return 0; ++} ++ ++static void ave_dma_unmap(struct net_device *ndev, struct ave_desc *desc, ++ enum dma_data_direction dir) ++{ ++ if (!desc->skbs_dma) ++ return; ++ ++ dma_unmap_single(ndev->dev.parent, ++ desc->skbs_dma, desc->skbs_dmalen, dir); ++ desc->skbs_dma = 0; ++} ++ ++/* Prepare Rx descriptor and memory */ ++static int ave_rxdesc_prepare(struct net_device *ndev, int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct sk_buff *skb; ++ dma_addr_t paddr; ++ int ret; ++ ++ skb = priv->rx.desc[entry].skbs; ++ if (!skb) { ++ 
skb = netdev_alloc_skb(ndev, AVE_MAX_ETHFRAME); ++ if (!skb) { ++ netdev_err(ndev, "can't allocate skb for Rx\n"); ++ return -ENOMEM; ++ } ++ skb->data += AVE_FRAME_HEADROOM; ++ skb->tail += AVE_FRAME_HEADROOM; ++ } ++ ++ /* set disable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_STS_OWN); ++ ++ /* map Rx buffer ++ * Rx buffer set to the Rx descriptor has two restrictions: ++ * - Rx buffer address is 4 byte aligned. ++ * - Rx buffer begins with 2 byte headroom, and data will be put from ++ * (buffer + 2). ++ * To satisfy this, specify the address to put back the buffer ++ * pointer advanced by AVE_FRAME_HEADROOM, and expand the map size ++ * by AVE_FRAME_HEADROOM. ++ */ ++ ret = ave_dma_map(ndev, &priv->rx.desc[entry], ++ skb->data - AVE_FRAME_HEADROOM, ++ AVE_MAX_ETHFRAME + AVE_FRAME_HEADROOM, ++ DMA_FROM_DEVICE, &paddr); ++ if (ret) { ++ netdev_err(ndev, "can't map skb for Rx\n"); ++ dev_kfree_skb_any(skb); ++ return ret; ++ } ++ priv->rx.desc[entry].skbs = skb; ++ ++ /* set buffer pointer */ ++ ave_desc_write_addr(ndev, AVE_DESCID_RX, entry, paddr); ++ ++ /* set enable to cmdsts */ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_RX, entry, ++ AVE_STS_INTR | AVE_MAX_ETHFRAME); ++ ++ return ret; ++} ++ ++/* Switch state of descriptor */ ++static int ave_desc_switch(struct net_device *ndev, enum desc_state state) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ u32 val; ++ ++ switch (state) { ++ case AVE_DESC_START: ++ writel(AVE_DESCC_TD | AVE_DESCC_RD0, priv->base + AVE_DESCC); ++ break; ++ ++ case AVE_DESC_STOP: ++ writel(0, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, !val, ++ 150, 15000)) { ++ netdev_err(ndev, "can't stop descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_SUSPEND: ++ val = readl(priv->base + AVE_DESCC); ++ val |= AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ if (readl_poll_timeout(priv->base + AVE_DESCC, val, ++ val & (AVE_DESCC_RDSTP << 16), ++ 150, 150000)) { ++ netdev_err(ndev, "can't suspend descriptor\n"); ++ ret = -EBUSY; ++ } ++ break; ++ ++ case AVE_DESC_RX_PERMIT: ++ val = readl(priv->base + AVE_DESCC); ++ val &= ~AVE_DESCC_RDSTP; ++ val &= ~AVE_DESCC_STATUS_MASK; ++ writel(val, priv->base + AVE_DESCC); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ave_tx_complete(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ unsigned int nr_freebuf = 0; ++ unsigned int tx_packets = 0; ++ unsigned int tx_bytes = 0; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ ++ /* free pre-stored skb from done_idx to proc_idx */ ++ while (proc_idx != done_idx) { ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_TX, done_idx); ++ ++ /* do nothing if owner is HW (==1 for Tx) */ ++ if (cmdsts & AVE_STS_OWN) ++ break; ++ ++ /* check Tx status and updates statistics */ ++ if (cmdsts & AVE_STS_OK) { ++ tx_bytes += cmdsts & AVE_STS_PKTLEN_TX_MASK; ++ /* success */ ++ if (cmdsts & AVE_STS_LAST) ++ tx_packets++; ++ } else { ++ /* error */ ++ if (cmdsts & AVE_STS_LAST) { ++ priv->stats_tx.errors++; ++ if (cmdsts & (AVE_STS_OWC | AVE_STS_EC)) ++ priv->stats_tx.collisions++; ++ } ++ } ++ ++ /* release skb */ ++ if (priv->tx.desc[done_idx].skbs) { ++ ave_dma_unmap(ndev, &priv->tx.desc[done_idx], ++ DMA_TO_DEVICE); ++ 
dev_consume_skb_any(priv->tx.desc[done_idx].skbs); ++ priv->tx.desc[done_idx].skbs = NULL; ++ nr_freebuf++; ++ } ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->tx.done_idx = done_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_tx.syncp); ++ priv->stats_tx.packets += tx_packets; ++ priv->stats_tx.bytes += tx_bytes; ++ u64_stats_update_end(&priv->stats_tx.syncp); ++ ++ /* wake queue for freeing buffer */ ++ if (unlikely(netif_queue_stopped(ndev)) && nr_freebuf) ++ netif_wake_queue(ndev); ++ ++ return nr_freebuf; ++} ++ ++static int ave_rx_receive(struct net_device *ndev, int num) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int rx_packets = 0; ++ unsigned int rx_bytes = 0; ++ u32 proc_idx, done_idx; ++ struct sk_buff *skb; ++ unsigned int pktlen; ++ int restpkt, npkts; ++ u32 ndesc, cmdsts; ++ ++ proc_idx = priv->rx.proc_idx; ++ done_idx = priv->rx.done_idx; ++ ndesc = priv->rx.ndesc; ++ restpkt = ((proc_idx + ndesc - 1) - done_idx) % ndesc; ++ ++ for (npkts = 0; npkts < num; npkts++) { ++ /* we can't receive more packet, so fill desc quickly */ ++ if (--restpkt < 0) ++ break; ++ ++ cmdsts = ave_desc_read_cmdsts(ndev, AVE_DESCID_RX, proc_idx); ++ ++ /* do nothing if owner is HW (==0 for Rx) */ ++ if (!(cmdsts & AVE_STS_OWN)) ++ break; ++ ++ if (!(cmdsts & AVE_STS_OK)) { ++ priv->stats_rx.errors++; ++ proc_idx = (proc_idx + 1) % ndesc; ++ continue; ++ } ++ ++ pktlen = cmdsts & AVE_STS_PKTLEN_RX_MASK; ++ ++ /* get skbuff for rx */ ++ skb = priv->rx.desc[proc_idx].skbs; ++ priv->rx.desc[proc_idx].skbs = NULL; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[proc_idx], DMA_FROM_DEVICE); ++ ++ skb->dev = ndev; ++ skb_put(skb, pktlen); ++ skb->protocol = eth_type_trans(skb, ndev); ++ ++ if ((cmdsts & AVE_STS_CSSV) && (!(cmdsts & AVE_STS_CSER))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ ++ rx_packets++; ++ rx_bytes += pktlen; ++ ++ netif_receive_skb(skb); ++ ++ proc_idx = (proc_idx + 1) % ndesc; ++ } ++ ++ priv->rx.proc_idx = proc_idx; ++ ++ /* update stats */ ++ u64_stats_update_begin(&priv->stats_rx.syncp); ++ priv->stats_rx.packets += rx_packets; ++ priv->stats_rx.bytes += rx_bytes; ++ u64_stats_update_end(&priv->stats_rx.syncp); ++ ++ /* refill the Rx buffers */ ++ while (proc_idx != done_idx) { ++ if (ave_rxdesc_prepare(ndev, done_idx)) ++ break; ++ done_idx = (done_idx + 1) % ndesc; ++ } ++ ++ priv->rx.done_idx = done_idx; ++ ++ return npkts; ++} ++ ++static int ave_napi_poll_rx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_rx); ++ ndev = priv->ndev; ++ ++ num = ave_rx_receive(ndev, budget); ++ if (num < budget) { ++ napi_complete_done(napi, num); ++ ++ /* enable Rx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_RXIINT); ++ } ++ ++ return num; ++} ++ ++static int ave_napi_poll_tx(struct napi_struct *napi, int budget) ++{ ++ struct ave_private *priv; ++ struct net_device *ndev; ++ int num; ++ ++ priv = container_of(napi, struct ave_private, napi_tx); ++ ndev = priv->ndev; ++ ++ num = ave_tx_complete(ndev); ++ napi_complete(napi); ++ ++ /* enable Tx interrupt when NAPI finishes */ ++ ave_irq_enable(ndev, AVE_GI_TX); ++ ++ return num; ++} ++ ++static void ave_global_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ /* set config register */ ++ val = AVE_CFGR_FLE | AVE_CFGR_IPFCEN | AVE_CFGR_CHE; ++ if (!phy_interface_mode_is_rgmii(priv->phy_mode)) ++ val |= 
AVE_CFGR_MII; ++ writel(val, priv->base + AVE_CFGR); ++ ++ /* reset RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val &= ~AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ /* assert reset */ ++ writel(AVE_GRR_GRST | AVE_GRR_PHYRST, priv->base + AVE_GRR); ++ msleep(20); ++ ++ /* 1st, negate PHY reset only */ ++ writel(AVE_GRR_GRST, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ msleep(40); ++ ++ /* negate RMII register */ ++ val = readl(priv->base + AVE_RSTCTRL); ++ val |= AVE_RSTCTRL_RMIIRST; ++ writel(val, priv->base + AVE_RSTCTRL); ++ ++ ave_irq_disable_all(ndev); ++} ++ ++static void ave_rxfifo_reset(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 rxcr_org; ++ ++ /* save and disable MAC receive op */ ++ rxcr_org = readl(priv->base + AVE_RXCR); ++ writel(rxcr_org & (~AVE_RXCR_RXEN), priv->base + AVE_RXCR); ++ ++ /* suspend Rx descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_SUSPEND); ++ ++ /* receive all packets before descriptor starts */ ++ ave_rx_receive(ndev, priv->rx.ndesc); ++ ++ /* assert reset */ ++ writel(AVE_GRR_RXFFR, priv->base + AVE_GRR); ++ udelay(50); ++ ++ /* negate reset */ ++ writel(0, priv->base + AVE_GRR); ++ udelay(20); ++ ++ /* negate interrupt status */ ++ writel(AVE_GI_RXOVF, priv->base + AVE_GISR); ++ ++ /* permit descriptor */ ++ ave_desc_switch(ndev, AVE_DESC_RX_PERMIT); ++ ++ /* restore MAC reccieve op */ ++ writel(rxcr_org, priv->base + AVE_RXCR); ++} ++ ++static irqreturn_t ave_irq_handler(int irq, void *netdev) ++{ ++ struct net_device *ndev = (struct net_device *)netdev; ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 gimr_val, gisr_val; ++ ++ gimr_val = ave_irq_disable_all(ndev); ++ ++ /* get interrupt status */ ++ gisr_val = readl(priv->base + AVE_GISR); ++ ++ /* PHY */ ++ if (gisr_val & AVE_GI_PHY) ++ writel(AVE_GI_PHY, priv->base + AVE_GISR); ++ ++ /* check exceeding packet */ ++ if (gisr_val & AVE_GI_RXERR) { ++ writel(AVE_GI_RXERR, priv->base + AVE_GISR); ++ netdev_err(ndev, "receive a packet exceeding frame buffer\n"); ++ } ++ ++ gisr_val &= gimr_val; ++ if (!gisr_val) ++ goto exit_isr; ++ ++ /* RxFIFO overflow */ ++ if (gisr_val & AVE_GI_RXOVF) { ++ priv->stats_rx.fifo_errors++; ++ ave_rxfifo_reset(ndev); ++ goto exit_isr; ++ } ++ ++ /* Rx drop */ ++ if (gisr_val & AVE_GI_RXDROP) { ++ priv->stats_rx.dropped++; ++ writel(AVE_GI_RXDROP, priv->base + AVE_GISR); ++ } ++ ++ /* Rx interval */ ++ if (gisr_val & AVE_GI_RXIINT) { ++ napi_schedule(&priv->napi_rx); ++ /* still force to disable Rx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_RXIINT; ++ } ++ ++ /* Tx completed */ ++ if (gisr_val & AVE_GI_TX) { ++ napi_schedule(&priv->napi_tx); ++ /* still force to disable Tx interrupt until NAPI finishes */ ++ gimr_val &= ~AVE_GI_TX; ++ } ++ ++exit_isr: ++ ave_irq_restore(ndev, gimr_val); ++ ++ return IRQ_HANDLED; ++} ++ ++static int ave_pfsel_start(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val | BIT(entry), priv->base + AVE_PFEN); ++ ++ return 0; ++} ++ ++static int ave_pfsel_stop(struct net_device *ndev, unsigned int entry) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 val; ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ ++ val = readl(priv->base + AVE_PFEN); ++ writel(val & ~BIT(entry), priv->base + AVE_PFEN); 
++ ++ return 0; ++} ++ ++static int ave_pfsel_set_macaddr(struct net_device *ndev, ++ unsigned int entry, ++ const unsigned char *mac_addr, ++ unsigned int set_size) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return -EINVAL; ++ if (WARN_ON(set_size > 6)) ++ return -EINVAL; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set MAC address for the filter */ ++ ave_hw_write_macaddr(ndev, mac_addr, ++ AVE_PKTF(entry), AVE_PKTF(entry) + 4); ++ ++ /* set byte mask */ ++ writel(GENMASK(31, set_size) & AVE_PFMBYTE_MASK0, ++ priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to ring 0 */ ++ writel(0, priv->base + AVE_PFSEL(entry)); ++ ++ /* restart filter */ ++ ave_pfsel_start(ndev, entry); ++ ++ return 0; ++} ++ ++static void ave_pfsel_set_promisc(struct net_device *ndev, ++ unsigned int entry, u32 rxring) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ if (WARN_ON(entry > AVE_PF_SIZE)) ++ return; ++ ++ ave_pfsel_stop(ndev, entry); ++ ++ /* set byte mask */ ++ writel(AVE_PFMBYTE_MASK0, priv->base + AVE_PFMBYTE(entry)); ++ writel(AVE_PFMBYTE_MASK1, priv->base + AVE_PFMBYTE(entry) + 4); ++ ++ /* set bit mask filter */ ++ writel(AVE_PFMBIT_MASK, priv->base + AVE_PFMBIT(entry)); ++ ++ /* set selector to rxring */ ++ writel(rxring, priv->base + AVE_PFSEL(entry)); ++ ++ ave_pfsel_start(ndev, entry); ++} ++ ++static void ave_pfsel_init(struct net_device *ndev) ++{ ++ unsigned char bcast_mac[ETH_ALEN]; ++ int i; ++ ++ eth_broadcast_addr(bcast_mac); ++ ++ for (i = 0; i < AVE_PF_SIZE; i++) ++ ave_pfsel_stop(ndev, i); ++ ++ /* promiscious entry, select ring 0 */ ++ ave_pfsel_set_promisc(ndev, AVE_PFNUM_FILTER, 0); ++ ++ /* unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++ ++ /* broadcast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_BROADCAST, bcast_mac, 6); ++} ++ ++static void ave_phy_adjust_link(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct phy_device *phydev = ndev->phydev; ++ u32 val, txcr, rxcr, rxcr_org; ++ u16 rmt_adv = 0, lcl_adv = 0; ++ u8 cap; ++ ++ /* set RGMII speed */ ++ val = readl(priv->base + AVE_TXCR); ++ val &= ~(AVE_TXCR_TXSPD_100 | AVE_TXCR_TXSPD_1G); ++ ++ if (phy_interface_is_rgmii(phydev) && phydev->speed == SPEED_1000) ++ val |= AVE_TXCR_TXSPD_1G; ++ else if (phydev->speed == SPEED_100) ++ val |= AVE_TXCR_TXSPD_100; ++ ++ writel(val, priv->base + AVE_TXCR); ++ ++ /* set RMII speed (100M/10M only) */ ++ if (!phy_interface_is_rgmii(phydev)) { ++ val = readl(priv->base + AVE_LINKSEL); ++ if (phydev->speed == SPEED_10) ++ val &= ~AVE_LINKSEL_100M; ++ else ++ val |= AVE_LINKSEL_100M; ++ writel(val, priv->base + AVE_LINKSEL); ++ } ++ ++ /* check current RXCR/TXCR */ ++ rxcr = readl(priv->base + AVE_RXCR); ++ txcr = readl(priv->base + AVE_TXCR); ++ rxcr_org = rxcr; ++ ++ if (phydev->duplex) { ++ rxcr |= AVE_RXCR_FDUPEN; ++ ++ if (phydev->pause) ++ rmt_adv |= LPA_PAUSE_CAP; ++ if (phydev->asym_pause) ++ rmt_adv |= LPA_PAUSE_ASYM; ++ ++ lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising); ++ cap = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); ++ if (cap & FLOW_CTRL_TX) ++ txcr |= AVE_TXCR_FLOCTR; ++ else ++ txcr &= ~AVE_TXCR_FLOCTR; ++ if (cap & FLOW_CTRL_RX) ++ rxcr |= AVE_RXCR_FLOCTR; ++ else ++ rxcr &= ~AVE_RXCR_FLOCTR; ++ } else { ++ rxcr &= ~AVE_RXCR_FDUPEN; ++ rxcr &= 
~AVE_RXCR_FLOCTR; ++ txcr &= ~AVE_TXCR_FLOCTR; ++ } ++ ++ if (rxcr_org != rxcr) { ++ /* disable Rx mac */ ++ writel(rxcr & ~AVE_RXCR_RXEN, priv->base + AVE_RXCR); ++ /* change and enable TX/Rx mac */ ++ writel(txcr, priv->base + AVE_TXCR); ++ writel(rxcr, priv->base + AVE_RXCR); ++ } ++ ++ phy_print_status(phydev); ++} ++ ++static void ave_macaddr_init(struct net_device *ndev) ++{ ++ ave_hw_write_macaddr(ndev, ndev->dev_addr, AVE_RXMAC1R, AVE_RXMAC2R); ++ ++ /* pfsel unicast entry */ ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_UNICAST, ndev->dev_addr, 6); ++} ++ ++static int ave_init(struct net_device *ndev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct ave_private *priv = netdev_priv(ndev); ++ struct device *dev = ndev->dev.parent; ++ struct device_node *np = dev->of_node; ++ struct device_node *mdio_np; ++ struct phy_device *phydev; ++ int nc, nr, ret; ++ ++ /* enable clk because of hw access until ndo_open */ ++ for (nc = 0; nc < priv->nclks; nc++) { ++ ret = clk_prepare_enable(priv->clk[nc]); ++ if (ret) { ++ dev_err(dev, "can't enable clock\n"); ++ goto out_clk_disable; ++ } ++ } ++ ++ for (nr = 0; nr < priv->nrsts; nr++) { ++ ret = reset_control_deassert(priv->rst[nr]); ++ if (ret) { ++ dev_err(dev, "can't deassert reset\n"); ++ goto out_reset_assert; ++ } ++ } ++ ++ ret = regmap_update_bits(priv->regmap, SG_ETPINMODE, ++ priv->pinmode_mask, priv->pinmode_val); ++ if (ret) ++ goto out_reset_assert; ++ ++ ave_global_reset(ndev); ++ ++ mdio_np = of_get_child_by_name(np, "mdio"); ++ if (!mdio_np) { ++ dev_err(dev, "mdio node not found\n"); ++ ret = -EINVAL; ++ goto out_reset_assert; ++ } ++ ret = of_mdiobus_register(priv->mdio, mdio_np); ++ of_node_put(mdio_np); ++ if (ret) { ++ dev_err(dev, "failed to register mdiobus\n"); ++ goto out_reset_assert; ++ } ++ ++ phydev = of_phy_get_and_connect(ndev, np, ave_phy_adjust_link); ++ if (!phydev) { ++ dev_err(dev, "could not attach to PHY\n"); ++ ret = -ENODEV; ++ goto out_mdio_unregister; ++ } ++ ++ priv->phydev = phydev; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ device_set_wakeup_capable(&ndev->dev, !!wol.supported); ++ ++ /* set wol initial state disabled */ ++ wol.wolopts = 0; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (!phy_interface_is_rgmii(phydev)) ++ phy_set_max_speed(phydev, SPEED_100); ++ ++ phy_support_asym_pause(phydev); ++ ++ phydev->mac_managed_pm = true; ++ ++ phy_attached_info(phydev); ++ ++ return 0; ++ ++out_mdio_unregister: ++ mdiobus_unregister(priv->mdio); ++out_reset_assert: ++ while (--nr >= 0) ++ reset_control_assert(priv->rst[nr]); ++out_clk_disable: ++ while (--nc >= 0) ++ clk_disable_unprepare(priv->clk[nc]); ++ ++ return ret; ++} ++ ++static void ave_uninit(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int i; ++ ++ phy_disconnect(priv->phydev); ++ mdiobus_unregister(priv->mdio); ++ ++ /* disable clk because of hw access after ndo_stop */ ++ for (i = 0; i < priv->nrsts; i++) ++ reset_control_assert(priv->rst[i]); ++ for (i = 0; i < priv->nclks; i++) ++ clk_disable_unprepare(priv->clk[i]); ++} ++ ++static int ave_open(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ int ret; ++ u32 val; ++ ++ ret = request_irq(priv->irq, ave_irq_handler, IRQF_SHARED, ndev->name, ++ ndev); ++ if (ret) ++ return ret; ++ ++ priv->tx.desc = kcalloc(priv->tx.ndesc, sizeof(*priv->tx.desc), ++ GFP_KERNEL); ++ if (!priv->tx.desc) { ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ priv->rx.desc = kcalloc(priv->rx.ndesc, 
sizeof(*priv->rx.desc), ++ GFP_KERNEL); ++ if (!priv->rx.desc) { ++ kfree(priv->tx.desc); ++ ret = -ENOMEM; ++ goto out_free_irq; ++ } ++ ++ /* initialize Tx work and descriptor */ ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, entry, 0); ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, entry, 0); ++ } ++ writel(AVE_TXDC_ADDR_START | ++ (((priv->tx.ndesc * priv->desc_size) << 16) & AVE_TXDC_SIZE), ++ priv->base + AVE_TXDC); ++ ++ /* initialize Rx work and descriptor */ ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (ave_rxdesc_prepare(ndev, entry)) ++ break; ++ } ++ writel(AVE_RXDC0_ADDR_START | ++ (((priv->rx.ndesc * priv->desc_size) << 16) & AVE_RXDC0_SIZE), ++ priv->base + AVE_RXDC0); ++ ++ ave_desc_switch(ndev, AVE_DESC_START); ++ ++ ave_pfsel_init(ndev); ++ ave_macaddr_init(ndev); ++ ++ /* set Rx configuration */ ++ /* full duplex, enable pause drop, enalbe flow control */ ++ val = AVE_RXCR_RXEN | AVE_RXCR_FDUPEN | AVE_RXCR_DRPEN | ++ AVE_RXCR_FLOCTR | (AVE_MAX_ETHFRAME & AVE_RXCR_MPSIZ_MASK); ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set Tx configuration */ ++ /* enable flow control, disable loopback */ ++ writel(AVE_TXCR_FLOCTR, priv->base + AVE_TXCR); ++ ++ /* enable timer, clear EN,INTM, and mask interval unit(BSCK) */ ++ val = readl(priv->base + AVE_IIRQC) & AVE_IIRQC_BSCK; ++ val |= AVE_IIRQC_EN0 | (AVE_INTM_COUNT << 16); ++ writel(val, priv->base + AVE_IIRQC); ++ ++ val = AVE_GI_RXIINT | AVE_GI_RXOVF | AVE_GI_TX | AVE_GI_RXDROP; ++ ave_irq_restore(ndev, val); ++ ++ napi_enable(&priv->napi_rx); ++ napi_enable(&priv->napi_tx); ++ ++ phy_start(ndev->phydev); ++ phy_start_aneg(ndev->phydev); ++ netif_start_queue(ndev); ++ ++ return 0; ++ ++out_free_irq: ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ return ret; ++} ++ ++static int ave_stop(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ int entry; ++ ++ ave_irq_disable_all(ndev); ++ disable_irq(priv->irq); ++ free_irq(priv->irq, ndev); ++ ++ netif_tx_disable(ndev); ++ phy_stop(ndev->phydev); ++ napi_disable(&priv->napi_tx); ++ napi_disable(&priv->napi_rx); ++ ++ ave_desc_switch(ndev, AVE_DESC_STOP); ++ ++ /* free Tx buffer */ ++ for (entry = 0; entry < priv->tx.ndesc; entry++) { ++ if (!priv->tx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->tx.desc[entry], DMA_TO_DEVICE); ++ dev_kfree_skb_any(priv->tx.desc[entry].skbs); ++ priv->tx.desc[entry].skbs = NULL; ++ } ++ priv->tx.proc_idx = 0; ++ priv->tx.done_idx = 0; ++ ++ /* free Rx buffer */ ++ for (entry = 0; entry < priv->rx.ndesc; entry++) { ++ if (!priv->rx.desc[entry].skbs) ++ continue; ++ ++ ave_dma_unmap(ndev, &priv->rx.desc[entry], DMA_FROM_DEVICE); ++ dev_kfree_skb_any(priv->rx.desc[entry].skbs); ++ priv->rx.desc[entry].skbs = NULL; ++ } ++ priv->rx.proc_idx = 0; ++ priv->rx.done_idx = 0; ++ ++ kfree(priv->tx.desc); ++ kfree(priv->rx.desc); ++ ++ return 0; ++} ++ ++static netdev_tx_t ave_start_xmit(struct sk_buff *skb, struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ u32 proc_idx, done_idx, ndesc, cmdsts; ++ int ret, freepkt; ++ dma_addr_t paddr; ++ ++ proc_idx = priv->tx.proc_idx; ++ done_idx = priv->tx.done_idx; ++ ndesc = priv->tx.ndesc; ++ freepkt = ((done_idx + ndesc - 1) - proc_idx) % ndesc; ++ ++ /* stop queue when not enough entry */ ++ if (unlikely(freepkt < 1)) { ++ netif_stop_queue(ndev); ++ return 
NETDEV_TX_BUSY; ++ } ++ ++ /* add padding for short packet */ ++ if (skb_put_padto(skb, ETH_ZLEN)) { ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ /* map Tx buffer ++ * Tx buffer set to the Tx descriptor doesn't have any restriction. ++ */ ++ ret = ave_dma_map(ndev, &priv->tx.desc[proc_idx], ++ skb->data, skb->len, DMA_TO_DEVICE, &paddr); ++ if (ret) { ++ dev_kfree_skb_any(skb); ++ priv->stats_tx.dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ priv->tx.desc[proc_idx].skbs = skb; ++ ++ ave_desc_write_addr(ndev, AVE_DESCID_TX, proc_idx, paddr); ++ ++ cmdsts = AVE_STS_OWN | AVE_STS_1ST | AVE_STS_LAST | ++ (skb->len & AVE_STS_PKTLEN_TX_MASK); ++ ++ /* set interrupt per AVE_FORCE_TXINTCNT or when queue is stopped */ ++ if (!(proc_idx % AVE_FORCE_TXINTCNT) || netif_queue_stopped(ndev)) ++ cmdsts |= AVE_STS_INTR; ++ ++ /* disable checksum calculation when skb doesn't calurate checksum */ ++ if (skb->ip_summed == CHECKSUM_NONE || ++ skb->ip_summed == CHECKSUM_UNNECESSARY) ++ cmdsts |= AVE_STS_NOCSUM; ++ ++ ave_desc_write_cmdsts(ndev, AVE_DESCID_TX, proc_idx, cmdsts); ++ ++ priv->tx.proc_idx = (proc_idx + 1) % ndesc; ++ ++ return NETDEV_TX_OK; ++} ++ ++static int ave_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) ++{ ++ return phy_mii_ioctl(ndev->phydev, ifr, cmd); ++} ++ ++static const u8 v4multi_macadr[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++static const u8 v6multi_macadr[] = { 0x33, 0x00, 0x00, 0x00, 0x00, 0x00 }; ++ ++static void ave_set_rx_mode(struct net_device *ndev) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ struct netdev_hw_addr *hw_adr; ++ int count, mc_cnt; ++ u32 val; ++ ++ /* MAC addr filter enable for promiscious mode */ ++ mc_cnt = netdev_mc_count(ndev); ++ val = readl(priv->base + AVE_RXCR); ++ if (ndev->flags & IFF_PROMISC || !mc_cnt) ++ val &= ~AVE_RXCR_AFEN; ++ else ++ val |= AVE_RXCR_AFEN; ++ writel(val, priv->base + AVE_RXCR); ++ ++ /* set all multicast address */ ++ if ((ndev->flags & IFF_ALLMULTI) || mc_cnt > AVE_PF_MULTICAST_SIZE) { ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST, ++ v4multi_macadr, 1); ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + 1, ++ v6multi_macadr, 1); ++ } else { ++ /* stop all multicast filter */ ++ for (count = 0; count < AVE_PF_MULTICAST_SIZE; count++) ++ ave_pfsel_stop(ndev, AVE_PFNUM_MULTICAST + count); ++ ++ /* set multicast addresses */ ++ count = 0; ++ netdev_for_each_mc_addr(hw_adr, ndev) { ++ if (count == mc_cnt) ++ break; ++ ave_pfsel_set_macaddr(ndev, AVE_PFNUM_MULTICAST + count, ++ hw_adr->addr, 6); ++ count++; ++ } ++ } ++} ++ ++static void ave_get_stats64(struct net_device *ndev, ++ struct rtnl_link_stats64 *stats) ++{ ++ struct ave_private *priv = netdev_priv(ndev); ++ unsigned int start; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_rx.syncp); ++ stats->rx_packets = priv->stats_rx.packets; ++ stats->rx_bytes = priv->stats_rx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_rx.syncp, start)); ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&priv->stats_tx.syncp); ++ stats->tx_packets = priv->stats_tx.packets; ++ stats->tx_bytes = priv->stats_tx.bytes; ++ } while (u64_stats_fetch_retry_irq(&priv->stats_tx.syncp, start)); ++ ++ stats->rx_errors = priv->stats_rx.errors; ++ stats->tx_errors = priv->stats_tx.errors; ++ stats->rx_dropped = priv->stats_rx.dropped; ++ stats->tx_dropped = priv->stats_tx.dropped; ++ stats->rx_fifo_errors = priv->stats_rx.fifo_errors; ++ stats->collisions = priv->stats_tx.collisions; ++} ++ ++static int ave_set_mac_address(struct 
net_device *ndev, void *p) ++{ ++ int ret = eth_mac_addr(ndev, p); ++ ++ if (ret) ++ return ret; ++ ++ ave_macaddr_init(ndev); ++ ++ return 0; ++} ++ ++static const struct net_device_ops ave_netdev_ops = { ++ .ndo_init = ave_init, ++ .ndo_uninit = ave_uninit, ++ .ndo_open = ave_open, ++ .ndo_stop = ave_stop, ++ .ndo_start_xmit = ave_start_xmit, ++ .ndo_eth_ioctl = ave_ioctl, ++ .ndo_set_rx_mode = ave_set_rx_mode, ++ .ndo_get_stats64 = ave_get_stats64, ++ .ndo_set_mac_address = ave_set_mac_address, ++}; ++ ++static int ave_probe(struct platform_device *pdev) ++{ ++ const struct ave_soc_data *data; ++ struct device *dev = &pdev->dev; ++ char buf[ETHTOOL_FWVERS_LEN]; ++ struct of_phandle_args args; ++ phy_interface_t phy_mode; ++ struct ave_private *priv; ++ struct net_device *ndev; ++ struct device_node *np; ++ void __iomem *base; ++ const char *name; ++ int i, irq, ret; ++ u64 dma_mask; ++ u32 ave_id; ++ ++ data = of_device_get_match_data(dev); ++ if (WARN_ON(!data)) ++ return -EINVAL; ++ ++ np = dev->of_node; ++ ret = of_get_phy_mode(np, &phy_mode); ++ if (ret) { ++ dev_err(dev, "phy-mode not found\n"); ++ return ret; ++ } ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ base = devm_platform_ioremap_resource(pdev, 0); ++ if (IS_ERR(base)) ++ return PTR_ERR(base); ++ ++ ndev = devm_alloc_etherdev(dev, sizeof(struct ave_private)); ++ if (!ndev) { ++ dev_err(dev, "can't allocate ethernet device\n"); ++ return -ENOMEM; ++ } ++ ++ ndev->netdev_ops = &ave_netdev_ops; ++ ndev->ethtool_ops = &ave_ethtool_ops; ++ SET_NETDEV_DEV(ndev, dev); ++ ++ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ndev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_RXCSUM); ++ ++ ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN); ++ ++ ret = of_get_ethdev_address(np, ndev); ++ if (ret) { ++ /* if the mac address is invalid, use random mac address */ ++ eth_hw_addr_random(ndev); ++ dev_warn(dev, "Using random MAC address: %pM\n", ++ ndev->dev_addr); ++ } ++ ++ priv = netdev_priv(ndev); ++ priv->base = base; ++ priv->irq = irq; ++ priv->ndev = ndev; ++ priv->msg_enable = netif_msg_init(-1, AVE_DEFAULT_MSG_ENABLE); ++ priv->phy_mode = phy_mode; ++ priv->data = data; ++ ++ if (IS_DESC_64BIT(priv)) { ++ priv->desc_size = AVE_DESC_SIZE_64; ++ priv->tx.daddr = AVE_TXDM_64; ++ priv->rx.daddr = AVE_RXDM_64; ++ dma_mask = DMA_BIT_MASK(64); ++ } else { ++ priv->desc_size = AVE_DESC_SIZE_32; ++ priv->tx.daddr = AVE_TXDM_32; ++ priv->rx.daddr = AVE_RXDM_32; ++ dma_mask = DMA_BIT_MASK(32); ++ } ++ ret = dma_set_mask(dev, dma_mask); ++ if (ret) ++ return ret; ++ ++ priv->tx.ndesc = AVE_NR_TXDESC; ++ priv->rx.ndesc = AVE_NR_RXDESC; ++ ++ u64_stats_init(&priv->stats_tx.syncp); ++ u64_stats_init(&priv->stats_rx.syncp); ++ ++ for (i = 0; i < AVE_MAX_CLKS; i++) { ++ name = priv->data->clock_names[i]; ++ if (!name) ++ break; ++ priv->clk[i] = devm_clk_get(dev, name); ++ if (IS_ERR(priv->clk[i])) ++ return PTR_ERR(priv->clk[i]); ++ priv->nclks++; ++ } ++ ++ for (i = 0; i < AVE_MAX_RSTS; i++) { ++ name = priv->data->reset_names[i]; ++ if (!name) ++ break; ++ priv->rst[i] = devm_reset_control_get_shared(dev, name); ++ if (IS_ERR(priv->rst[i])) ++ return PTR_ERR(priv->rst[i]); ++ priv->nrsts++; ++ } ++ ++ ret = of_parse_phandle_with_fixed_args(np, ++ "socionext,syscon-phy-mode", ++ 1, 0, &args); ++ if (ret) { ++ dev_err(dev, "can't get syscon-phy-mode property\n"); ++ return ret; ++ } ++ priv->regmap = syscon_node_to_regmap(args.np); ++ of_node_put(args.np); ++ if (IS_ERR(priv->regmap)) { ++ 
dev_err(dev, "can't map syscon-phy-mode\n"); ++ return PTR_ERR(priv->regmap); ++ } ++ ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]); ++ if (ret) { ++ dev_err(dev, "invalid phy-mode setting\n"); ++ return ret; ++ } ++ ++ priv->mdio = devm_mdiobus_alloc(dev); ++ if (!priv->mdio) ++ return -ENOMEM; ++ priv->mdio->priv = ndev; ++ priv->mdio->parent = dev; ++ priv->mdio->read = ave_mdiobus_read; ++ priv->mdio->write = ave_mdiobus_write; ++ priv->mdio->name = "uniphier-mdio"; ++ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "%s-%x", ++ pdev->name, pdev->id); ++ ++ /* Register as a NAPI supported driver */ ++ netif_napi_add(ndev, &priv->napi_rx, ave_napi_poll_rx, ++ NAPI_POLL_WEIGHT); ++ netif_napi_add_tx(ndev, &priv->napi_tx, ave_napi_poll_tx); ++ ++ platform_set_drvdata(pdev, ndev); ++ ++ ret = register_netdev(ndev); ++ if (ret) { ++ dev_err(dev, "failed to register netdevice\n"); ++ goto out_del_napi; ++ } ++ ++ /* get ID and version */ ++ ave_id = readl(priv->base + AVE_IDR); ++ ave_hw_read_version(ndev, buf, sizeof(buf)); ++ ++ dev_info(dev, "Socionext %c%c%c%c Ethernet IP %s (irq=%d, phy=%s)\n", ++ (ave_id >> 24) & 0xff, (ave_id >> 16) & 0xff, ++ (ave_id >> 8) & 0xff, (ave_id >> 0) & 0xff, ++ buf, priv->irq, phy_modes(phy_mode)); ++ ++ return 0; ++ ++out_del_napi: ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return ret; ++} ++ ++static int ave_remove(struct platform_device *pdev) ++{ ++ struct net_device *ndev = platform_get_drvdata(pdev); ++ struct ave_private *priv = netdev_priv(ndev); ++ ++ unregister_netdev(ndev); ++ netif_napi_del(&priv->napi_rx); ++ netif_napi_del(&priv->napi_tx); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int ave_suspend(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ if (netif_running(ndev)) { ++ ret = ave_stop(ndev); ++ netif_device_detach(ndev); ++ } ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ priv->wolopts = wol.wolopts; ++ ++ return ret; ++} ++ ++static int ave_resume(struct device *dev) ++{ ++ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; ++ struct net_device *ndev = dev_get_drvdata(dev); ++ struct ave_private *priv = netdev_priv(ndev); ++ int ret = 0; ++ ++ ave_global_reset(ndev); ++ ++ ret = phy_init_hw(ndev->phydev); ++ if (ret) ++ return ret; ++ ++ ave_ethtool_get_wol(ndev, &wol); ++ wol.wolopts = priv->wolopts; ++ __ave_ethtool_set_wol(ndev, &wol); ++ ++ if (ndev->phydev) { ++ ret = phy_resume(ndev->phydev); ++ if (ret) ++ return ret; ++ } ++ ++ if (netif_running(ndev)) { ++ ret = ave_open(ndev); ++ netif_device_attach(ndev); ++ } ++ ++ return ret; ++} ++ ++static SIMPLE_DEV_PM_OPS(ave_pm_ops, ave_suspend, ave_resume); ++#define AVE_PM_OPS (&ave_pm_ops) ++#else ++#define AVE_PM_OPS NULL ++#endif ++ ++static int ave_pro4_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_MII: ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld11_get_pinmode(struct ave_private *priv, ++ phy_interface_t 
phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_INTERNAL: ++ priv->pinmode_val = 0; ++ break; ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_ld20_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 0) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(0); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(0); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ave_pxs3_get_pinmode(struct ave_private *priv, ++ phy_interface_t phy_mode, u32 arg) ++{ ++ if (arg > 1) ++ return -EINVAL; ++ ++ priv->pinmode_mask = SG_ETPINMODE_RMII(arg); ++ ++ switch (phy_mode) { ++ case PHY_INTERFACE_MODE_RMII: ++ priv->pinmode_val = SG_ETPINMODE_RMII(arg); ++ break; ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ priv->pinmode_val = 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct ave_soc_data ave_pro4_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "gio", "ether", "ether-gb", "ether-phy", ++ }, ++ .reset_names = { ++ "gio", "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs2_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pro4_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld11_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld11_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_ld20_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_ld20_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_pxs3_data = { ++ .is_desc_64bit = false, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct ave_soc_data ave_nx1_data = { ++ .is_desc_64bit = true, ++ .clock_names = { ++ "ether", ++ }, ++ .reset_names = { ++ "ether", ++ }, ++ .get_pinmode = ave_pxs3_get_pinmode, ++}; ++ ++static const struct of_device_id of_ave_match[] = { ++ { ++ .compatible = "socionext,uniphier-pro4-ave4", ++ .data = &ave_pro4_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs2-ave4", ++ .data = &ave_pxs2_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld11-ave4", ++ .data = &ave_ld11_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-ld20-ave4", ++ .data = &ave_ld20_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-pxs3-ave4", ++ .data = &ave_pxs3_data, ++ }, ++ { ++ .compatible = "socionext,uniphier-nx1-ave4", ++ .data = &ave_nx1_data, ++ }, ++ { /* Sentinel */ } ++}; ++MODULE_DEVICE_TABLE(of, of_ave_match); ++ ++static struct platform_driver ave_driver = { ++ .probe = ave_probe, ++ .remove = ave_remove, ++ .driver = { ++ .name = "ave", ++ .pm = AVE_PM_OPS, ++ 
.of_match_table = of_ave_match, ++ }, ++}; ++module_platform_driver(ave_driver); ++ ++MODULE_AUTHOR("Kunihiko Hayashi "); ++MODULE_DESCRIPTION("Socionext UniPhier AVE ethernet driver"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c +--- linux.orig/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/am65-cpsw-nuss.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1365,12 +1365,12 @@ static void am65_cpsw_nuss_ndo_get_stats cpu_stats = per_cpu_ptr(ndev_priv->stats, cpu); do { @@ -2501,11 +14651,10 @@ index f4a6b590a1e39..1b62400c19049 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c -index b15d44261e766..68c7b2c05aab3 100644 ---- a/drivers/net/ethernet/ti/netcp_core.c -+++ b/drivers/net/ethernet/ti/netcp_core.c -@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/ti/netcp_core.c linux/drivers/net/ethernet/ti/netcp_core.c +--- linux.orig/drivers/net/ethernet/ti/netcp_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/ti/netcp_core.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1916,16 +1916,16 @@ netcp_get_stats(struct net_device *ndev, unsigned int start; do { @@ -2526,11 +14675,10 @@ index b15d44261e766..68c7b2c05aab3 100644 stats->rx_packets = rxpackets; stats->rx_bytes = rxbytes; -diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c -index 509c5e9b29dfa..5301c907b5ae3 100644 ---- a/drivers/net/ethernet/via/via-rhine.c -+++ b/drivers/net/ethernet/via/via-rhine.c -@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/via/via-rhine.c linux/drivers/net/ethernet/via/via-rhine.c +--- linux.orig/drivers/net/ethernet/via/via-rhine.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/via/via-rhine.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2217,16 +2217,16 @@ rhine_get_stats64(struct net_device *dev netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2551,11 +14699,10 @@ index 509c5e9b29dfa..5301c907b5ae3 100644 } static void rhine_set_rx_mode(struct net_device *dev) -diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -index 9262988d26a32..2c233b59e7d93 100644 ---- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c -@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +--- linux.orig/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ethernet/xilinx/xilinx_axienet_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1305,16 +1305,16 @@ axienet_get_stats64(struct net_device *d netdev_stats_to_stats64(stats, &dev->stats); do { @@ -2576,11 +14723,10 @@ index 9262988d26a32..2c233b59e7d93 100644 } static const struct net_device_ops axienet_netdev_ops = { -diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c -index 8113ac17ab70a..2fd8b9c51e839 100644 ---- a/drivers/net/hyperv/netvsc_drv.c -+++ b/drivers/net/hyperv/netvsc_drv.c -@@ 
-1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct net_device *net, +diff -rupN linux.orig/drivers/net/hyperv/netvsc_drv.c linux/drivers/net/hyperv/netvsc_drv.c +--- linux.orig/drivers/net/hyperv/netvsc_drv.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/hyperv/netvsc_drv.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1264,12 +1264,12 @@ static void netvsc_get_vf_stats(struct n unsigned int start; do { @@ -2595,7 +14741,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1294,12 +1294,12 @@ static void netvsc_get_pcpu_stats(struct unsigned int start; do { @@ -2610,7 +14756,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_packets = this_tot->vf_rx_packets; this_tot->tx_packets = this_tot->vf_tx_packets; this_tot->rx_bytes = this_tot->vf_rx_bytes; -@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct net_device *net, +@@ -1318,20 +1318,20 @@ static void netvsc_get_pcpu_stats(struct tx_stats = &nvchan->tx_stats; do { @@ -2635,7 +14781,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 this_tot->rx_bytes += bytes; this_tot->rx_packets += packets; -@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct net_device *net, +@@ -1370,21 +1370,21 @@ static void netvsc_get_stats64(struct ne tx_stats = &nvchan->tx_stats; do { @@ -2661,7 +14807,7 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 t->rx_bytes += bytes; t->rx_packets += packets; -@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, +@@ -1527,24 +1527,24 @@ static void netvsc_get_ethtool_stats(str tx_stats = &nvdev->chan_table[j].tx_stats; do { @@ -2690,11 +14836,10 @@ index 8113ac17ab70a..2fd8b9c51e839 100644 data[i++] = packets; data[i++] = bytes; data[i++] = xdp_drop; -diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c -index 1c64d5347b8e0..78253ad57b2ef 100644 ---- a/drivers/net/ifb.c -+++ b/drivers/net/ifb.c -@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ifb.c linux/drivers/net/ifb.c +--- linux.orig/drivers/net/ifb.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ifb.c 2022-12-04 10:40:26.696034096 -0500 +@@ -162,18 +162,18 @@ static void ifb_stats64(struct net_devic for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { @@ -2717,7 +14862,7 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 stats->tx_packets += packets; stats->tx_bytes += bytes; } -@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **data, +@@ -245,12 +245,12 @@ static void ifb_fill_stats_data(u64 **da int j; do { @@ -2732,11 +14877,10 @@ index 1c64d5347b8e0..78253ad57b2ef 100644 *data += IFB_Q_STATS_LEN; } -diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c -index 49ba8a50dfb1e..8a58d74638cd8 100644 ---- a/drivers/net/ipvlan/ipvlan_main.c -+++ b/drivers/net/ipvlan/ipvlan_main.c -@@ -299,13 +299,13 @@ static void ipvlan_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c linux/drivers/net/ipvlan/ipvlan_main.c +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c 2022-12-04 10:40:26.696034096 -0500 +@@ -301,13 +301,13 @@ static void ipvlan_get_stats64(struct ne for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { @@ -2752,11 +14896,1096 @@ index 49ba8a50dfb1e..8a58d74638cd8 100644 strt)); s->rx_packets += rx_pkts; -diff 
--git a/drivers/net/loopback.c b/drivers/net/loopback.c -index 14e8d04cb4347..c4ad98d39ea60 100644 ---- a/drivers/net/loopback.c -+++ b/drivers/net/loopback.c -@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) +diff -rupN linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig linux/drivers/net/ipvlan/ipvlan_main.c.orig +--- linux.orig/drivers/net/ipvlan/ipvlan_main.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/ipvlan/ipvlan_main.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,1082 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* Copyright (c) 2014 Mahesh Bandewar ++ */ ++ ++#include ++ ++#include "ipvlan.h" ++ ++static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan; ++ unsigned int flags; ++ int err; ++ ++ ASSERT_RTNL(); ++ if (port->mode != nval) { ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { ++ err = dev_change_flags(ipvlan->dev, ++ flags | IFF_NOARP, ++ extack); ++ } else { ++ err = dev_change_flags(ipvlan->dev, ++ flags & ~IFF_NOARP, ++ extack); ++ } ++ if (unlikely(err)) ++ goto fail; ++ } ++ if (nval == IPVLAN_MODE_L3S) { ++ /* New mode is L3S */ ++ err = ipvlan_l3s_register(port); ++ if (err) ++ goto fail; ++ } else if (port->mode == IPVLAN_MODE_L3S) { ++ /* Old mode was L3S */ ++ ipvlan_l3s_unregister(port); ++ } ++ port->mode = nval; ++ } ++ return 0; ++ ++fail: ++ /* Undo the flags changes that have been done so far. */ ++ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { ++ flags = ipvlan->dev->flags; ++ if (port->mode == IPVLAN_MODE_L3 || ++ port->mode == IPVLAN_MODE_L3S) ++ dev_change_flags(ipvlan->dev, flags | IFF_NOARP, ++ NULL); ++ else ++ dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, ++ NULL); ++ } ++ ++ return err; ++} ++ ++static int ipvlan_port_create(struct net_device *dev) ++{ ++ struct ipvl_port *port; ++ int err, idx; ++ ++ port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); ++ if (!port) ++ return -ENOMEM; ++ ++ write_pnet(&port->pnet, dev_net(dev)); ++ port->dev = dev; ++ port->mode = IPVLAN_MODE_L3; ++ INIT_LIST_HEAD(&port->ipvlans); ++ for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) ++ INIT_HLIST_HEAD(&port->hlhead[idx]); ++ ++ skb_queue_head_init(&port->backlog); ++ INIT_WORK(&port->wq, ipvlan_process_multicast); ++ ida_init(&port->ida); ++ port->dev_id_start = 1; ++ ++ err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); ++ if (err) ++ goto err; ++ ++ netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); ++ return 0; ++ ++err: ++ kfree(port); ++ return err; ++} ++ ++static void ipvlan_port_destroy(struct net_device *dev) ++{ ++ struct ipvl_port *port = ipvlan_port_get_rtnl(dev); ++ struct sk_buff *skb; ++ ++ netdev_put(dev, &port->dev_tracker); ++ if (port->mode == IPVLAN_MODE_L3S) ++ ipvlan_l3s_unregister(port); ++ netdev_rx_handler_unregister(dev); ++ cancel_work_sync(&port->wq); ++ while ((skb = __skb_dequeue(&port->backlog)) != NULL) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ } ++ ida_destroy(&port->ida); ++ kfree(port); ++} ++ ++#define IPVLAN_ALWAYS_ON_OFLOADS \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | \ ++ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) ++ ++#define IPVLAN_ALWAYS_ON \ ++ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) ++ ++#define IPVLAN_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ ++ 
NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ ++ NETIF_F_GRO | NETIF_F_RXCSUM | \ ++ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) ++ ++ /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ ++ ++#define IPVLAN_STATE_MASK \ ++ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) ++ ++static int ipvlan_init(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ int err; ++ ++ dev->state = (dev->state & ~IPVLAN_STATE_MASK) | ++ (phy_dev->state & IPVLAN_STATE_MASK); ++ dev->features = phy_dev->features & IPVLAN_FEATURES; ++ dev->features |= IPVLAN_ALWAYS_ON; ++ dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; ++ dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; ++ dev->hw_enc_features |= dev->features; ++ netif_inherit_tso_max(dev, phy_dev); ++ dev->hard_header_len = phy_dev->hard_header_len; ++ ++ netdev_lockdep_set_classes(dev); ++ ++ ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); ++ if (!ipvlan->pcpu_stats) ++ return -ENOMEM; ++ ++ if (!netif_is_ipvlan_port(phy_dev)) { ++ err = ipvlan_port_create(phy_dev); ++ if (err < 0) { ++ free_percpu(ipvlan->pcpu_stats); ++ return err; ++ } ++ } ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count += 1; ++ return 0; ++} ++ ++static void ipvlan_uninit(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_port *port; ++ ++ free_percpu(ipvlan->pcpu_stats); ++ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ port->count -= 1; ++ if (!port->count) ++ ipvlan_port_destroy(port->dev); ++} ++ ++static int ipvlan_open(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr; ++ ++ if (ipvlan->port->mode == IPVLAN_MODE_L3 || ++ ipvlan->port->mode == IPVLAN_MODE_L3S) ++ dev->flags |= IFF_NOARP; ++ else ++ dev->flags &= ~IFF_NOARP; ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static int ipvlan_stop(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ struct ipvl_addr *addr; ++ ++ dev_uc_unsync(phy_dev, dev); ++ dev_mc_unsync(phy_dev, dev); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ++ ipvlan_ht_addr_del(addr); ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ ++static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ int skblen = skb->len; ++ int ret; ++ ++ ret = ipvlan_queue_xmit(skb, dev); ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct ipvl_pcpu_stats *pcptr; ++ ++ pcptr = this_cpu_ptr(ipvlan->pcpu_stats); ++ ++ u64_stats_update_begin(&pcptr->syncp); ++ u64_stats_inc(&pcptr->tx_pkts); ++ u64_stats_add(&pcptr->tx_bytes, skblen); ++ u64_stats_update_end(&pcptr->syncp); ++ } else { ++ this_cpu_inc(ipvlan->pcpu_stats->tx_drps); ++ } ++ return ret; ++} ++ ++static netdev_features_t ipvlan_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ features |= NETIF_F_ALL_FOR_ALL; ++ features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); ++ features = netdev_increment_features(ipvlan->phy_dev->features, ++ features, features); ++ features |= IPVLAN_ALWAYS_ON; ++ features &= 
(IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); ++ ++ return features; ++} ++ ++static void ipvlan_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); ++} ++ ++static void ipvlan_set_multicast_mac_filter(struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { ++ bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); ++ } else { ++ struct netdev_hw_addr *ha; ++ DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ ++ bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); ++ netdev_for_each_mc_addr(ha, dev) ++ __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); ++ ++ /* Turn-on broadcast bit irrespective of address family, ++ * since broadcast is deferred to a work-queue, hence no ++ * impact on fast-path processing. ++ */ ++ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); ++ ++ bitmap_copy(ipvlan->mac_filters, mc_filters, ++ IPVLAN_MAC_FILTER_SIZE); ++ } ++ dev_uc_sync(ipvlan->phy_dev, dev); ++ dev_mc_sync(ipvlan->phy_dev, dev); ++} ++ ++static void ipvlan_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (ipvlan->pcpu_stats) { ++ struct ipvl_pcpu_stats *pcptr; ++ u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; ++ u32 rx_errs = 0, tx_drps = 0; ++ u32 strt; ++ int idx; ++ ++ for_each_possible_cpu(idx) { ++ pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); ++ do { ++ strt= u64_stats_fetch_begin_irq(&pcptr->syncp); ++ rx_pkts = u64_stats_read(&pcptr->rx_pkts); ++ rx_bytes = u64_stats_read(&pcptr->rx_bytes); ++ rx_mcast = u64_stats_read(&pcptr->rx_mcast); ++ tx_pkts = u64_stats_read(&pcptr->tx_pkts); ++ tx_bytes = u64_stats_read(&pcptr->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&pcptr->syncp, ++ strt)); ++ ++ s->rx_packets += rx_pkts; ++ s->rx_bytes += rx_bytes; ++ s->multicast += rx_mcast; ++ s->tx_packets += tx_pkts; ++ s->tx_bytes += tx_bytes; ++ ++ /* u32 values are updated without syncp protection. 
*/ ++ rx_errs += READ_ONCE(pcptr->rx_errs); ++ tx_drps += READ_ONCE(pcptr->tx_drps); ++ } ++ s->rx_errors = rx_errs; ++ s->rx_dropped = rx_errs; ++ s->tx_dropped = tx_drps; ++ } ++} ++ ++static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ return vlan_vid_add(phy_dev, proto, vid); ++} ++ ++static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, ++ u16 vid) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ vlan_vid_del(phy_dev, proto, vid); ++ return 0; ++} ++ ++static int ipvlan_get_iflink(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->phy_dev->ifindex; ++} ++ ++static const struct net_device_ops ipvlan_netdev_ops = { ++ .ndo_init = ipvlan_init, ++ .ndo_uninit = ipvlan_uninit, ++ .ndo_open = ipvlan_open, ++ .ndo_stop = ipvlan_stop, ++ .ndo_start_xmit = ipvlan_start_xmit, ++ .ndo_fix_features = ipvlan_fix_features, ++ .ndo_change_rx_flags = ipvlan_change_rx_flags, ++ .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, ++ .ndo_get_stats64 = ipvlan_get_stats64, ++ .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, ++ .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, ++ .ndo_get_iflink = ipvlan_get_iflink, ++}; ++ ++static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, ++ unsigned short type, const void *daddr, ++ const void *saddr, unsigned len) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct net_device *phy_dev = ipvlan->phy_dev; ++ ++ /* TODO Probably use a different field than dev_addr so that the ++ * mac-address on the virtual device is portable and can be carried ++ * while the packets use the mac-addr on the physical device. ++ */ ++ return dev_hard_header(skb, phy_dev, type, daddr, ++ saddr ? 
: phy_dev->dev_addr, len); ++} ++ ++static const struct header_ops ipvlan_header_ops = { ++ .create = ipvlan_hard_header, ++ .parse = eth_header_parse, ++ .cache = eth_header_cache, ++ .cache_update = eth_header_cache_update, ++}; ++ ++static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) ++{ ++ ipvlan->dev->mtu = dev->mtu; ++} ++ ++static bool netif_is_ipvlan(const struct net_device *dev) ++{ ++ /* both ipvlan and ipvtap devices use the same netdev_ops */ ++ return dev->netdev_ops == &ipvlan_netdev_ops; ++} ++ ++static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, ++ struct ethtool_link_ksettings *cmd) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); ++} ++ ++static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *drvinfo) ++{ ++ strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); ++ strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); ++} ++ ++static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) ++{ ++ const struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return ipvlan->msg_enable; ++} ++ ++static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ ipvlan->msg_enable = value; ++} ++ ++static const struct ethtool_ops ipvlan_ethtool_ops = { ++ .get_link = ethtool_op_get_link, ++ .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, ++ .get_drvinfo = ipvlan_ethtool_get_drvinfo, ++ .get_msglevel = ipvlan_ethtool_get_msglevel, ++ .set_msglevel = ipvlan_ethtool_set_msglevel, ++}; ++ ++static int ipvlan_nl_changelink(struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int err = 0; ++ ++ if (!data) ++ return 0; ++ if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, nmode, extack); ++ } ++ ++ if (!err && data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (flags & IPVLAN_F_PRIVATE) ++ ipvlan_mark_private(port); ++ else ++ ipvlan_clear_private(port); ++ ++ if (flags & IPVLAN_F_VEPA) ++ ipvlan_mark_vepa(port); ++ else ++ ipvlan_clear_vepa(port); ++ } ++ ++ return err; ++} ++ ++static size_t ipvlan_nl_getsize(const struct net_device *dev) ++{ ++ return (0 ++ + nla_total_size(2) /* IFLA_IPVLAN_MODE */ ++ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ++ ); ++} ++ ++static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_IPVLAN_MODE]) { ++ u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ if (mode >= IPVLAN_MODE_MAX) ++ return -EINVAL; ++ } ++ if (data[IFLA_IPVLAN_FLAGS]) { ++ u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ /* Only two bits are used at this moment. */ ++ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ /* Also both flags can't be active at the same time. 
*/ ++ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == ++ (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ipvlan_nl_fillinfo(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); ++ int ret = -EINVAL; ++ ++ if (!port) ++ goto err; ++ ++ ret = -EMSGSIZE; ++ if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) ++ goto err; ++ if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) ++ goto err; ++ ++ return 0; ++ ++err: ++ return ret; ++} ++ ++int ipvlan_link_new(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_port *port; ++ struct net_device *phy_dev; ++ int err; ++ u16 mode = IPVLAN_MODE_L3; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ ++ phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); ++ if (!phy_dev) ++ return -ENODEV; ++ ++ if (netif_is_ipvlan(phy_dev)) { ++ struct ipvl_dev *tmp = netdev_priv(phy_dev); ++ ++ phy_dev = tmp->phy_dev; ++ if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) ++ return -EPERM; ++ } else if (!netif_is_ipvlan_port(phy_dev)) { ++ /* Exit early if the underlying link is invalid or busy */ ++ if (phy_dev->type != ARPHRD_ETHER || ++ phy_dev->flags & IFF_LOOPBACK) { ++ netdev_err(phy_dev, ++ "Master is either lo or non-ether device\n"); ++ return -EINVAL; ++ } ++ ++ if (netdev_is_rx_handler_busy(phy_dev)) { ++ netdev_err(phy_dev, "Device is already in use.\n"); ++ return -EBUSY; ++ } ++ } ++ ++ ipvlan->phy_dev = phy_dev; ++ ipvlan->dev = dev; ++ ipvlan->sfeatures = IPVLAN_FEATURES; ++ if (!tb[IFLA_MTU]) ++ ipvlan_adjust_mtu(ipvlan, phy_dev); ++ INIT_LIST_HEAD(&ipvlan->addrs); ++ spin_lock_init(&ipvlan->addrs_lock); ++ ++ /* TODO Probably put random address here to be presented to the ++ * world but keep using the physical-dev address for the outgoing ++ * packets. ++ */ ++ eth_hw_addr_set(dev, phy_dev->dev_addr); ++ ++ dev->priv_flags |= IFF_NO_RX_HANDLER; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ /* ipvlan_init() would have created the port, if required */ ++ port = ipvlan_port_get_rtnl(phy_dev); ++ ipvlan->port = port; ++ ++ /* If the port-id base is at the MAX value, then wrap it around and ++ * begin from 0x1 again. This may be due to a busy system where lots ++ * of slaves are getting created and deleted. ++ */ ++ if (port->dev_id_start == 0xFFFE) ++ port->dev_id_start = 0x1; ++ ++ /* Since L2 address is shared among all IPvlan slaves including ++ * master, use unique 16 bit dev-ids to diffentiate among them. ++ * Assign IDs between 0x1 and 0xFFFE (used by the master) to each ++ * slave link [see addrconf_ifid_eui48()]. ++ */ ++ err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, ++ GFP_KERNEL); ++ if (err < 0) ++ err = ida_simple_get(&port->ida, 0x1, port->dev_id_start, ++ GFP_KERNEL); ++ if (err < 0) ++ goto unregister_netdev; ++ dev->dev_id = err; ++ ++ /* Increment id-base to the next slot for the future assignment */ ++ port->dev_id_start = err + 1; ++ ++ err = netdev_upper_dev_link(phy_dev, dev, extack); ++ if (err) ++ goto remove_ida; ++ ++ /* Flags are per port and latest update overrides. User has ++ * to be consistent in setting it just like the mode attribute. 
++ */ ++ if (data && data[IFLA_IPVLAN_FLAGS]) ++ port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); ++ ++ if (data && data[IFLA_IPVLAN_MODE]) ++ mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); ++ ++ err = ipvlan_set_port_mode(port, mode, extack); ++ if (err) ++ goto unlink_netdev; ++ ++ list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); ++ netif_stacked_transfer_operstate(phy_dev, dev); ++ return 0; ++ ++unlink_netdev: ++ netdev_upper_dev_unlink(phy_dev, dev); ++remove_ida: ++ ida_simple_remove(&port->ida, dev->dev_id); ++unregister_netdev: ++ unregister_netdevice(dev); ++ return err; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_new); ++ ++void ipvlan_link_delete(struct net_device *dev, struct list_head *head) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct ipvl_addr *addr, *next; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ kfree_rcu(addr, rcu); ++ } ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ ++ ida_simple_remove(&ipvlan->port->ida, dev->dev_id); ++ list_del_rcu(&ipvlan->pnode); ++ unregister_netdevice_queue(dev, head); ++ netdev_upper_dev_unlink(ipvlan->phy_dev, dev); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_delete); ++ ++void ipvlan_link_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); ++ dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; ++ dev->netdev_ops = &ipvlan_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->header_ops = &ipvlan_header_ops; ++ dev->ethtool_ops = &ipvlan_ethtool_ops; ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_setup); ++ ++static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = ++{ ++ [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, ++ [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, ++}; ++ ++static struct net *ipvlan_get_link_net(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ return dev_net(ipvlan->phy_dev); ++} ++ ++static struct rtnl_link_ops ipvlan_link_ops = { ++ .kind = "ipvlan", ++ .priv_size = sizeof(struct ipvl_dev), ++ ++ .setup = ipvlan_link_setup, ++ .newlink = ipvlan_link_new, ++ .dellink = ipvlan_link_delete, ++ .get_link_net = ipvlan_get_link_net, ++}; ++ ++int ipvlan_link_register(struct rtnl_link_ops *ops) ++{ ++ ops->get_size = ipvlan_nl_getsize; ++ ops->policy = ipvlan_nl_policy; ++ ops->validate = ipvlan_nl_validate; ++ ops->fill_info = ipvlan_nl_fillinfo; ++ ops->changelink = ipvlan_nl_changelink; ++ ops->maxtype = IFLA_IPVLAN_MAX; ++ return rtnl_link_register(ops); ++} ++EXPORT_SYMBOL_GPL(ipvlan_link_register); ++ ++static int ipvlan_device_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); ++ struct netdev_notifier_pre_changeaddr_info *prechaddr_info; ++ struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct ipvl_dev *ipvlan, *next; ++ struct ipvl_port *port; ++ LIST_HEAD(lst_kill); ++ int err; ++ ++ if (!netif_is_ipvlan_port(dev)) ++ return NOTIFY_DONE; ++ ++ port = ipvlan_port_get_rtnl(dev); ++ ++ switch (event) { ++ case NETDEV_UP: ++ case NETDEV_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ netif_stacked_transfer_operstate(ipvlan->phy_dev, ++ ipvlan->dev); ++ break; ++ ++ case NETDEV_REGISTER: { ++ struct net *oldnet, *newnet = dev_net(dev); ++ ++ oldnet = read_pnet(&port->pnet); ++ if (net_eq(newnet, oldnet)) ++ break; ++ ++ 
write_pnet(&port->pnet, newnet); ++ ++ ipvlan_migrate_l3s_hook(oldnet, newnet); ++ break; ++ } ++ case NETDEV_UNREGISTER: ++ if (dev->reg_state != NETREG_UNREGISTERING) ++ break; ++ ++ list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ++ ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, ++ &lst_kill); ++ unregister_netdevice_many(&lst_kill); ++ break; ++ ++ case NETDEV_FEAT_CHANGE: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ netif_inherit_tso_max(ipvlan->dev, dev); ++ netdev_update_features(ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_CHANGEMTU: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) ++ ipvlan_adjust_mtu(ipvlan, dev); ++ break; ++ ++ case NETDEV_PRE_CHANGEADDR: ++ prechaddr_info = ptr; ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ err = dev_pre_changeaddr_notify(ipvlan->dev, ++ prechaddr_info->dev_addr, ++ extack); ++ if (err) ++ return notifier_from_errno(err); ++ } ++ break; ++ ++ case NETDEV_CHANGEADDR: ++ list_for_each_entry(ipvlan, &port->ipvlans, pnode) { ++ eth_hw_addr_set(ipvlan->dev, dev->dev_addr); ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); ++ } ++ break; ++ ++ case NETDEV_PRE_TYPE_CHANGE: ++ /* Forbid underlying device to change its type. */ ++ return NOTIFY_BAD; ++ } ++ return NOTIFY_DONE; ++} ++ ++/* the caller must held the addrs lock */ ++static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); ++ if (!addr) ++ return -ENOMEM; ++ ++ addr->master = ipvlan; ++ if (!is_v6) { ++ memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); ++ addr->atype = IPVL_IPV4; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); ++ addr->atype = IPVL_IPV6; ++#endif ++ } ++ ++ list_add_tail_rcu(&addr->anode, &ipvlan->addrs); ++ ++ /* If the interface is not up, the address will be added to the hash ++ * list by ipvlan_open. 
++ */ ++ if (netif_running(ipvlan->dev)) ++ ipvlan_ht_addr_add(ipvlan, addr); ++ ++ return 0; ++} ++ ++static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) ++{ ++ struct ipvl_addr *addr; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); ++ if (!addr) { ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return; ++ } ++ ++ ipvlan_ht_addr_del(addr); ++ list_del_rcu(&addr->anode); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ kfree_rcu(addr, rcu); ++} ++ ++static bool ipvlan_is_valid_dev(const struct net_device *dev) ++{ ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!netif_is_ipvlan(dev)) ++ return false; ++ ++ if (!ipvlan || !ipvlan->port) ++ return false; ++ ++ return true; ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv6=%pI6c addr for %s intf\n", ++ ip6_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip6_addr, true); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip6_addr, true); ++} ++ ++static int ipvlan_addr6_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if6->idev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_add_addr6(ipvlan, &if6->addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ipvlan_del_addr6(ipvlan, &if6->addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr6_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { ++ NL_SET_ERR_MSG(i6vi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++#endif ++ ++static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ int ret = -EINVAL; ++ ++ spin_lock_bh(&ipvlan->addrs_lock); ++ if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) ++ netif_err(ipvlan, ifup, ipvlan->dev, ++ "Failed to add IPv4=%pI4 on %s intf.\n", ++ ip4_addr, ipvlan->dev->name); ++ else ++ ret = ipvlan_add_addr(ipvlan, ip4_addr, false); ++ spin_unlock_bh(&ipvlan->addrs_lock); ++ return ret; ++} ++ ++static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) ++{ ++ return ipvlan_del_addr(ipvlan, ip4_addr, false); ++} ++ ++static int ipvlan_addr4_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; ++ struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ struct in_addr ip4_addr; ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch 
(event) { ++ case NETDEV_UP: ++ ip4_addr.s_addr = if4->ifa_address; ++ if (ipvlan_add_addr4(ipvlan, &ip4_addr)) ++ return NOTIFY_BAD; ++ break; ++ ++ case NETDEV_DOWN: ++ ip4_addr.s_addr = if4->ifa_address; ++ ipvlan_del_addr4(ipvlan, &ip4_addr); ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static int ipvlan_addr4_validator_event(struct notifier_block *unused, ++ unsigned long event, void *ptr) ++{ ++ struct in_validator_info *ivi = (struct in_validator_info *)ptr; ++ struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; ++ struct ipvl_dev *ipvlan = netdev_priv(dev); ++ ++ if (!ipvlan_is_valid_dev(dev)) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_UP: ++ if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { ++ NL_SET_ERR_MSG(ivi->extack, ++ "Address already assigned to an ipvlan device"); ++ return notifier_from_errno(-EADDRINUSE); ++ } ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_event, ++}; ++ ++static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr4_validator_event, ++}; ++ ++static struct notifier_block ipvlan_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_device_event, ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_event, ++}; ++ ++static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { ++ .notifier_call = ipvlan_addr6_validator_event, ++}; ++#endif ++ ++static int __init ipvlan_init_module(void) ++{ ++ int err; ++ ++ ipvlan_init_secret(); ++ register_netdevice_notifier(&ipvlan_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ register_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ register_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ register_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); ++ ++ err = ipvlan_l3s_init(); ++ if (err < 0) ++ goto error; ++ ++ err = ipvlan_link_register(&ipvlan_link_ops); ++ if (err < 0) { ++ ipvlan_l3s_cleanup(); ++ goto error; ++ } ++ ++ return 0; ++error: ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ return err; ++} ++ ++static void __exit ipvlan_cleanup_module(void) ++{ ++ rtnl_link_unregister(&ipvlan_link_ops); ++ ipvlan_l3s_cleanup(); ++ unregister_netdevice_notifier(&ipvlan_notifier_block); ++ unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); ++ unregister_inetaddr_validator_notifier( ++ &ipvlan_addr4_vtor_notifier_block); ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); ++ unregister_inet6addr_validator_notifier( ++ &ipvlan_addr6_vtor_notifier_block); ++#endif ++} ++ ++module_init(ipvlan_init_module); ++module_exit(ipvlan_cleanup_module); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Mahesh Bandewar "); ++MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); ++MODULE_ALIAS_RTNL_LINK("ipvlan"); +diff -rupN linux.orig/drivers/net/loopback.c linux/drivers/net/loopback.c +--- 
linux.orig/drivers/net/loopback.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/loopback.c 2022-12-04 10:40:26.696034096 -0500 +@@ -106,10 +106,10 @@ void dev_lstats_read(struct net_device * lb_stats = per_cpu_ptr(dev->lstats, i); do { @@ -2769,11 +15998,10 @@ index 14e8d04cb4347..c4ad98d39ea60 100644 *bytes += tbytes; *packets += tpackets; } -diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c -index c6d271e5687e9..5056f3cd5699a 100644 ---- a/drivers/net/macsec.c -+++ b/drivers/net/macsec.c -@@ -2823,9 +2823,9 @@ static void get_rx_sc_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c linux/drivers/net/macsec.c +--- linux.orig/drivers/net/macsec.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macsec.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2821,9 +2821,9 @@ static void get_rx_sc_stats(struct net_d stats = per_cpu_ptr(rx_sc->stats, cpu); do { @@ -2785,7 +16013,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; -@@ -2904,9 +2904,9 @@ static void get_tx_sc_stats(struct net_device *dev, +@@ -2902,9 +2902,9 @@ static void get_tx_sc_stats(struct net_d stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { @@ -2797,7 +16025,7 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; -@@ -2960,9 +2960,9 @@ static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) +@@ -2958,9 +2958,9 @@ static void get_secy_stats(struct net_de stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { @@ -2809,11 +16037,4431 @@ index c6d271e5687e9..5056f3cd5699a 100644 sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; -diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c -index 1080d6ebff63b..a1c7823f0ba66 100644 ---- a/drivers/net/macvlan.c -+++ b/drivers/net/macvlan.c -@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/macsec.c.orig linux/drivers/net/macsec.c.orig +--- linux.orig/drivers/net/macsec.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/net/macsec.c.orig 2022-12-04 10:40:18.180055916 -0500 +@@ -0,0 +1,4417 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * drivers/net/macsec.c - MACsec device ++ * ++ * Copyright (c) 2015 Sabrina Dubroca ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MACSEC_SCI_LEN 8 ++ ++/* SecTAG length = macsec_eth_header without the optional SCI */ ++#define MACSEC_TAG_LEN 6 ++ ++struct macsec_eth_header { ++ struct ethhdr eth; ++ /* SecTAG */ ++ u8 tci_an; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ u8 short_length:6, ++ unused:2; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ u8 unused:2, ++ short_length:6; ++#else ++#error "Please fix " ++#endif ++ __be32 packet_number; ++ u8 secure_channel_id[8]; /* optional */ ++} __packed; ++ ++#define MACSEC_TCI_VERSION 0x80 ++#define MACSEC_TCI_ES 0x40 /* end station */ ++#define MACSEC_TCI_SC 0x20 /* SCI present */ ++#define MACSEC_TCI_SCB 0x10 /* epon */ ++#define MACSEC_TCI_E 0x08 /* encryption */ ++#define MACSEC_TCI_C 0x04 /* changed text */ ++#define MACSEC_AN_MASK 0x03 /* association number */ ++#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) ++ ++/* minimum secure data length deemed 
"not short", see IEEE 802.1AE-2006 9.7 */ ++#define MIN_NON_SHORT_LEN 48 ++ ++#define GCM_AES_IV_LEN 12 ++#define DEFAULT_ICV_LEN 16 ++ ++#define for_each_rxsc(secy, sc) \ ++ for (sc = rcu_dereference_bh(secy->rx_sc); \ ++ sc; \ ++ sc = rcu_dereference_bh(sc->next)) ++#define for_each_rxsc_rtnl(secy, sc) \ ++ for (sc = rtnl_dereference(secy->rx_sc); \ ++ sc; \ ++ sc = rtnl_dereference(sc->next)) ++ ++#define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) ++ ++struct gcm_iv_xpn { ++ union { ++ u8 short_secure_channel_id[4]; ++ ssci_t ssci; ++ }; ++ __be64 pn; ++} __packed; ++ ++struct gcm_iv { ++ union { ++ u8 secure_channel_id[8]; ++ sci_t sci; ++ }; ++ __be32 pn; ++}; ++ ++#define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT ++ ++struct pcpu_secy_stats { ++ struct macsec_dev_stats stats; ++ struct u64_stats_sync syncp; ++}; ++ ++/** ++ * struct macsec_dev - private data ++ * @secy: SecY config ++ * @real_dev: pointer to underlying netdevice ++ * @dev_tracker: refcount tracker for @real_dev reference ++ * @stats: MACsec device stats ++ * @secys: linked list of SecY's on the underlying device ++ * @gro_cells: pointer to the Generic Receive Offload cell ++ * @offload: status of offloading on the MACsec device ++ */ ++struct macsec_dev { ++ struct macsec_secy secy; ++ struct net_device *real_dev; ++ netdevice_tracker dev_tracker; ++ struct pcpu_secy_stats __percpu *stats; ++ struct list_head secys; ++ struct gro_cells gro_cells; ++ enum macsec_offload offload; ++}; ++ ++/** ++ * struct macsec_rxh_data - rx_handler private argument ++ * @secys: linked list of SecY's on this underlying device ++ */ ++struct macsec_rxh_data { ++ struct list_head secys; ++}; ++ ++static struct macsec_dev *macsec_priv(const struct net_device *dev) ++{ ++ return (struct macsec_dev *)netdev_priv(dev); ++} ++ ++static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) ++{ ++ return rcu_dereference_bh(dev->rx_handler_data); ++} ++ ++static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) ++{ ++ return rtnl_dereference(dev->rx_handler_data); ++} ++ ++struct macsec_cb { ++ struct aead_request *req; ++ union { ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_rx_sa *rx_sa; ++ }; ++ u8 assoc_num; ++ bool valid; ++ bool has_sci; ++}; ++ ++static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) ++{ ++ struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static struct macsec_rx_sa *macsec_active_rxsa_get(struct macsec_rx_sc *rx_sc) ++{ ++ struct macsec_rx_sa *sa = NULL; ++ int an; ++ ++ for (an = 0; an < MACSEC_NUM_AN; an++) { ++ sa = macsec_rxsa_get(rx_sc->sa[an]); ++ if (sa) ++ break; ++ } ++ return sa; ++} ++ ++static void free_rx_sc_rcu(struct rcu_head *head) ++{ ++ struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); ++ ++ free_percpu(rx_sc->stats); ++ kfree(rx_sc); ++} ++ ++static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) ++{ ++ return refcount_inc_not_zero(&sc->refcnt) ? 
sc : NULL; ++} ++ ++static void macsec_rxsc_put(struct macsec_rx_sc *sc) ++{ ++ if (refcount_dec_and_test(&sc->refcnt)) ++ call_rcu(&sc->rcu_head, free_rx_sc_rcu); ++} ++ ++static void free_rxsa(struct rcu_head *head) ++{ ++ struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_rxsa_put(struct macsec_rx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_rxsa); ++} ++ ++static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) ++{ ++ struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); ++ ++ if (!sa || !sa->active) ++ return NULL; ++ ++ if (!refcount_inc_not_zero(&sa->refcnt)) ++ return NULL; ++ ++ return sa; ++} ++ ++static void free_txsa(struct rcu_head *head) ++{ ++ struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); ++ ++ crypto_free_aead(sa->key.tfm); ++ free_percpu(sa->stats); ++ kfree(sa); ++} ++ ++static void macsec_txsa_put(struct macsec_tx_sa *sa) ++{ ++ if (refcount_dec_and_test(&sa->refcnt)) ++ call_rcu(&sa->rcu, free_txsa); ++} ++ ++static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) ++{ ++ BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); ++ return (struct macsec_cb *)skb->cb; ++} ++ ++#define MACSEC_PORT_ES (htons(0x0001)) ++#define MACSEC_PORT_SCB (0x0000) ++#define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) ++#define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) ++ ++#define MACSEC_GCM_AES_128_SAK_LEN 16 ++#define MACSEC_GCM_AES_256_SAK_LEN 32 ++ ++#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN ++#define DEFAULT_XPN false ++#define DEFAULT_SEND_SCI true ++#define DEFAULT_ENCRYPT false ++#define DEFAULT_ENCODING_SA 0 ++#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) ++ ++static bool send_sci(const struct macsec_secy *secy) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ return tx_sc->send_sci || ++ (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); ++} ++ ++static sci_t make_sci(const u8 *addr, __be16 port) ++{ ++ sci_t sci; ++ ++ memcpy(&sci, addr, ETH_ALEN); ++ memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); ++ ++ return sci; ++} ++ ++static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) ++{ ++ sci_t sci; ++ ++ if (sci_present) ++ memcpy(&sci, hdr->secure_channel_id, ++ sizeof(hdr->secure_channel_id)); ++ else ++ sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); ++ ++ return sci; ++} ++ ++static unsigned int macsec_sectag_len(bool sci_present) ++{ ++ return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); ++} ++ ++static unsigned int macsec_hdr_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + ETH_HLEN; ++} ++ ++static unsigned int macsec_extra_len(bool sci_present) ++{ ++ return macsec_sectag_len(sci_present) + sizeof(__be16); ++} ++ ++/* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ ++static void macsec_fill_sectag(struct macsec_eth_header *h, ++ const struct macsec_secy *secy, u32 pn, ++ bool sci_present) ++{ ++ const struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ ++ memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); ++ h->eth.h_proto = htons(ETH_P_MACSEC); ++ ++ if (sci_present) { ++ h->tci_an |= MACSEC_TCI_SC; ++ memcpy(&h->secure_channel_id, &secy->sci, ++ sizeof(h->secure_channel_id)); ++ } else { ++ if (tx_sc->end_station) ++ h->tci_an |= MACSEC_TCI_ES; ++ if (tx_sc->scb) ++ h->tci_an |= MACSEC_TCI_SCB; ++ } ++ ++ h->packet_number = htonl(pn); ++ ++ /* with GCM, C/E clear for !encrypt, both set for encrypt */ ++ if (tx_sc->encrypt) ++ h->tci_an |= MACSEC_TCI_CONFID; ++ else if (secy->icv_len != DEFAULT_ICV_LEN) ++ h->tci_an |= MACSEC_TCI_C; ++ ++ h->tci_an |= tx_sc->encoding_sa; ++} ++ ++static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) ++{ ++ if (data_len < MIN_NON_SHORT_LEN) ++ h->short_length = data_len; ++} ++ ++/* Checks if a MACsec interface is being offloaded to an hardware engine */ ++static bool macsec_is_offloaded(struct macsec_dev *macsec) ++{ ++ if (macsec->offload == MACSEC_OFFLOAD_MAC || ++ macsec->offload == MACSEC_OFFLOAD_PHY) ++ return true; ++ ++ return false; ++} ++ ++/* Checks if underlying layers implement MACsec offloading functions. */ ++static bool macsec_check_offload(enum macsec_offload offload, ++ struct macsec_dev *macsec) ++{ ++ if (!macsec || !macsec->real_dev) ++ return false; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev && ++ macsec->real_dev->phydev->macsec_ops; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ return macsec->real_dev->features & NETIF_F_HW_MACSEC && ++ macsec->real_dev->macsec_ops; ++ ++ return false; ++} ++ ++static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, ++ struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (ctx) { ++ memset(ctx, 0, sizeof(*ctx)); ++ ctx->offload = offload; ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ ctx->phydev = macsec->real_dev->phydev; ++ else if (offload == MACSEC_OFFLOAD_MAC) ++ ctx->netdev = macsec->real_dev; ++ } ++ ++ if (offload == MACSEC_OFFLOAD_PHY) ++ return macsec->real_dev->phydev->macsec_ops; ++ else ++ return macsec->real_dev->macsec_ops; ++} ++ ++/* Returns a pointer to the MACsec ops struct if any and updates the MACsec ++ * context device reference if provided. 
++ */ ++static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, ++ struct macsec_context *ctx) ++{ ++ if (!macsec_check_offload(macsec->offload, macsec)) ++ return NULL; ++ ++ return __macsec_get_ops(macsec->offload, macsec, ctx); ++} ++ ++/* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ ++static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) ++{ ++ struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; ++ int len = skb->len - 2 * ETH_ALEN; ++ int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; ++ ++ /* a) It comprises at least 17 octets */ ++ if (skb->len <= 16) ++ return false; ++ ++ /* b) MACsec EtherType: already checked */ ++ ++ /* c) V bit is clear */ ++ if (h->tci_an & MACSEC_TCI_VERSION) ++ return false; ++ ++ /* d) ES or SCB => !SC */ ++ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && ++ (h->tci_an & MACSEC_TCI_SC)) ++ return false; ++ ++ /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ ++ if (h->unused) ++ return false; ++ ++ /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ ++ if (!h->packet_number && !xpn) ++ return false; ++ ++ /* length check, f) g) h) i) */ ++ if (h->short_length) ++ return len == extra_len + h->short_length; ++ return len >= extra_len + MIN_NON_SHORT_LEN; ++} ++ ++#define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) ++#define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN ++ ++static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, ++ salt_t salt) ++{ ++ struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; ++ ++ gcm_iv->ssci = ssci ^ salt.ssci; ++ gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; ++} ++ ++static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) ++{ ++ struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; ++ ++ gcm_iv->sci = sci; ++ gcm_iv->pn = htonl(pn); ++} ++ ++static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) ++{ ++ return (struct macsec_eth_header *)skb_mac_header(skb); ++} ++ ++static void __macsec_pn_wrapped(struct macsec_secy *secy, ++ struct macsec_tx_sa *tx_sa) ++{ ++ pr_debug("PN wrapped, transitioning to !oper\n"); ++ tx_sa->active = false; ++ if (secy->protect_frames) ++ secy->operational = false; ++} ++ ++void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) ++{ ++ spin_lock_bh(&tx_sa->lock); ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++} ++EXPORT_SYMBOL_GPL(macsec_pn_wrapped); ++ ++static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, ++ struct macsec_secy *secy) ++{ ++ pn_t pn; ++ ++ spin_lock_bh(&tx_sa->lock); ++ ++ pn = tx_sa->next_pn_halves; ++ if (secy->xpn) ++ tx_sa->next_pn++; ++ else ++ tx_sa->next_pn_halves.lower++; ++ ++ if (tx_sa->next_pn == 0) ++ __macsec_pn_wrapped(secy, tx_sa); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ return pn; ++} ++ ++static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ ++ skb->dev = macsec->real_dev; ++ skb_reset_mac_header(skb); ++ skb->protocol = eth_hdr(skb)->h_proto; ++} ++ ++static unsigned int macsec_msdu_len(struct sk_buff *skb) ++{ ++ struct macsec_dev *macsec = macsec_priv(skb->dev); ++ struct macsec_secy *secy = &macsec->secy; ++ bool sci_present = macsec_skb_cb(skb)->has_sci; ++ ++ return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; ++} ++ ++static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, ++ struct macsec_tx_sa *tx_sa) ++{ ++ unsigned int msdu_len = 
macsec_msdu_len(skb); ++ struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); ++ ++ u64_stats_update_begin(&txsc_stats->syncp); ++ if (tx_sc->encrypt) { ++ txsc_stats->stats.OutOctetsEncrypted += msdu_len; ++ txsc_stats->stats.OutPktsEncrypted++; ++ this_cpu_inc(tx_sa->stats->OutPktsEncrypted); ++ } else { ++ txsc_stats->stats.OutOctetsProtected += msdu_len; ++ txsc_stats->stats.OutPktsProtected++; ++ this_cpu_inc(tx_sa->stats->OutPktsProtected); ++ } ++ u64_stats_update_end(&txsc_stats->syncp); ++} ++ ++static void count_tx(struct net_device *dev, int ret, int len) ++{ ++ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_add(&stats->tx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++ } ++} ++ ++static void macsec_encrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; ++ int len, ret; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ rcu_read_lock_bh(); ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ /* packet is encrypted/protected so tx_bytes must be calculated */ ++ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ rcu_read_unlock_bh(); ++ ++ macsec_txsa_put(sa); ++ dev_put(dev); ++} ++ ++static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, ++ unsigned char **iv, ++ struct scatterlist **sg, ++ int num_frags) ++{ ++ size_t size, iv_offset, sg_offset; ++ struct aead_request *req; ++ void *tmp; ++ ++ size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); ++ iv_offset = size; ++ size += GCM_AES_IV_LEN; ++ ++ size = ALIGN(size, __alignof__(struct scatterlist)); ++ sg_offset = size; ++ size += sizeof(struct scatterlist) * num_frags; ++ ++ tmp = kmalloc(size, GFP_ATOMIC); ++ if (!tmp) ++ return NULL; ++ ++ *iv = (unsigned char *)(tmp + iv_offset); ++ *sg = (struct scatterlist *)(tmp + sg_offset); ++ req = tmp; ++ ++ aead_request_set_tfm(req, tfm); ++ ++ return req; ++} ++ ++static struct sk_buff *macsec_encrypt(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; ++ unsigned char *iv; ++ struct ethhdr *eth; ++ struct macsec_eth_header *hh; ++ size_t unprotected_len; ++ struct aead_request *req; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ bool sci_present; ++ pn_t pn; ++ ++ secy = &macsec->secy; ++ tx_sc = &secy->tx_sc; ++ ++ /* 10.5.1 TX SA assignment */ ++ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); ++ if (!tx_sa) { ++ secy->operational = false; ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || ++ skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { ++ struct sk_buff *nskb = skb_copy_expand(skb, ++ MACSEC_NEEDED_HEADROOM, ++ MACSEC_NEEDED_TAILROOM, ++ GFP_ATOMIC); ++ if (likely(nskb)) { ++ consume_skb(skb); ++ skb = nskb; ++ } else { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-ENOMEM); ++ } ++ } ++ 
++ unprotected_len = skb->len; ++ eth = eth_hdr(skb); ++ sci_present = send_sci(secy); ++ hh = skb_push(skb, macsec_extra_len(sci_present)); ++ memmove(hh, eth, 2 * ETH_ALEN); ++ ++ pn = tx_sa_update_pn(tx_sa, secy); ++ if (pn.full64 == 0) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOLINK); ++ } ++ macsec_fill_sectag(hh, secy, pn.lower, sci_present); ++ macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); ++ ++ skb_put(skb, secy->icv_len); ++ ++ if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsTooLong++; ++ u64_stats_update_end(&secy_stats->syncp); ++ ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (secy->xpn) ++ macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); ++ else ++ macsec_fill_iv(iv, secy->sci, pn.lower); ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (tx_sc->encrypt) { ++ int len = skb->len - macsec_hdr_len(sci_present) - ++ secy->icv_len; ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(sci_present)); ++ } else { ++ aead_request_set_crypt(req, sg, sg, 0, iv); ++ aead_request_set_ad(req, skb->len - secy->icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ macsec_skb_cb(skb)->tx_sa = tx_sa; ++ macsec_skb_cb(skb)->has_sci = sci_present; ++ aead_request_set_callback(req, 0, macsec_encrypt_done, skb); ++ ++ dev_hold(skb->dev); ++ ret = crypto_aead_encrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ dev_put(skb->dev); ++ kfree_skb(skb); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ dev_put(skb->dev); ++ aead_request_free(req); ++ macsec_txsa_put(tx_sa); ++ ++ return skb; ++} ++ ++static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) ++{ ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); ++ struct macsec_eth_header *hdr = macsec_ethhdr(skb); ++ u32 lowest_pn = 0; ++ ++ spin_lock(&rx_sa->lock); ++ if (rx_sa->next_pn_halves.lower >= secy->replay_window) ++ lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; ++ ++ /* Now perform replay protection check again ++ * (see IEEE 802.1AE-2006 figure 10-5) ++ */ ++ if (secy->replay_protect && pn < lowest_pn && ++ (!secy->xpn || pn_same_half(pn, lowest_pn))) { ++ spin_unlock(&rx_sa->lock); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_dropped++; ++ return false; ++ } ++ ++ if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { ++ unsigned int msdu_len = macsec_msdu_len(skb); ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (hdr->tci_an & MACSEC_TCI_E) ++ rxsc_stats->stats.InOctetsDecrypted += msdu_len; ++ else ++ rxsc_stats->stats.InOctetsValidated += msdu_len; ++ 
u64_stats_update_end(&rxsc_stats->syncp); ++ } ++ ++ if (!macsec_skb_cb(skb)->valid) { ++ spin_unlock(&rx_sa->lock); ++ ++ /* 10.6.5 */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotValid++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ this_cpu_inc(rx_sa->stats->InPktsNotValid); ++ secy->netdev->stats.rx_errors++; ++ return false; ++ } ++ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { ++ rxsc_stats->stats.InPktsInvalid++; ++ this_cpu_inc(rx_sa->stats->InPktsInvalid); ++ } else if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsUnchecked++; ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ } else { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ if (pn < lowest_pn) { ++ rxsc_stats->stats.InPktsDelayed++; ++ } else { ++ rxsc_stats->stats.InPktsOK++; ++ this_cpu_inc(rx_sa->stats->InPktsOK); ++ } ++ u64_stats_update_end(&rxsc_stats->syncp); ++ ++ // Instead of "pn >=" - to support pn overflow in xpn ++ if (pn + 1 > rx_sa->next_pn_halves.lower) { ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } else if (secy->xpn && ++ !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { ++ rx_sa->next_pn_halves.upper++; ++ rx_sa->next_pn_halves.lower = pn + 1; ++ } ++ ++ spin_unlock(&rx_sa->lock); ++ } ++ ++ return true; ++} ++ ++static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++} ++ ++static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) ++{ ++ skb->ip_summed = CHECKSUM_NONE; ++ memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); ++ skb_pull(skb, hdr_len); ++ pskb_trim_unique(skb, skb->len - icv_len); ++} ++ ++static void count_rx(struct net_device *dev, int len) ++{ ++ struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_add(&stats->rx_bytes, len); ++ u64_stats_update_end(&stats->syncp); ++} ++ ++static void macsec_decrypt_done(struct crypto_async_request *base, int err) ++{ ++ struct sk_buff *skb = base->data; ++ struct net_device *dev = skb->dev; ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; ++ struct macsec_rx_sc *rx_sc = rx_sa->sc; ++ int len; ++ u32 pn; ++ ++ aead_request_free(macsec_skb_cb(skb)->req); ++ ++ if (!err) ++ macsec_skb_cb(skb)->valid = true; ++ ++ rcu_read_lock_bh(); ++ pn = ntohl(macsec_ethhdr(skb)->packet_number); ++ if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { ++ rcu_read_unlock_bh(); ++ kfree_skb(skb); ++ goto out; ++ } ++ ++ macsec_finalize_skb(skb, macsec->secy.icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, macsec->secy.netdev); ++ ++ if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ ++ rcu_read_unlock_bh(); ++ ++out: ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ dev_put(dev); ++} ++ ++static struct sk_buff *macsec_decrypt(struct sk_buff *skb, ++ struct net_device *dev, ++ struct macsec_rx_sa *rx_sa, ++ sci_t sci, ++ struct macsec_secy *secy) ++{ ++ int ret; ++ struct scatterlist *sg; ++ struct sk_buff *trailer; 
++ unsigned char *iv; ++ struct aead_request *req; ++ struct macsec_eth_header *hdr; ++ u32 hdr_pn; ++ u16 icv_len = secy->icv_len; ++ ++ macsec_skb_cb(skb)->valid = false; ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = skb_cow_data(skb, 0, &trailer); ++ if (unlikely(ret < 0)) { ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); ++ if (!req) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ hdr = (struct macsec_eth_header *)skb->data; ++ hdr_pn = ntohl(hdr->packet_number); ++ ++ if (secy->xpn) { ++ pn_t recovered_pn = rx_sa->next_pn_halves; ++ ++ recovered_pn.lower = hdr_pn; ++ if (hdr_pn < rx_sa->next_pn_halves.lower && ++ !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) ++ recovered_pn.upper++; ++ ++ macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, ++ rx_sa->key.salt); ++ } else { ++ macsec_fill_iv(iv, sci, hdr_pn); ++ } ++ ++ sg_init_table(sg, ret); ++ ret = skb_to_sgvec(skb, sg, 0, skb->len); ++ if (unlikely(ret < 0)) { ++ aead_request_free(req); ++ kfree_skb(skb); ++ return ERR_PTR(ret); ++ } ++ ++ if (hdr->tci_an & MACSEC_TCI_E) { ++ /* confidentiality: ethernet + macsec header ++ * authenticated, encrypted payload ++ */ ++ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); ++ ++ aead_request_set_crypt(req, sg, sg, len, iv); ++ aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ if (!skb) { ++ aead_request_free(req); ++ return ERR_PTR(-ENOMEM); ++ } ++ } else { ++ /* integrity only: all headers + data authenticated */ ++ aead_request_set_crypt(req, sg, sg, icv_len, iv); ++ aead_request_set_ad(req, skb->len - icv_len); ++ } ++ ++ macsec_skb_cb(skb)->req = req; ++ skb->dev = dev; ++ aead_request_set_callback(req, 0, macsec_decrypt_done, skb); ++ ++ dev_hold(dev); ++ ret = crypto_aead_decrypt(req); ++ if (ret == -EINPROGRESS) { ++ return ERR_PTR(ret); ++ } else if (ret != 0) { ++ /* decryption/authentication failed ++ * 10.6 if validateFrames is disabled, deliver anyway ++ */ ++ if (ret != -EBADMSG) { ++ kfree_skb(skb); ++ skb = ERR_PTR(ret); ++ } ++ } else { ++ macsec_skb_cb(skb)->valid = true; ++ } ++ dev_put(dev); ++ ++ aead_request_free(req); ++ ++ return skb; ++} ++ ++static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc; ++ ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ if (rx_sc->sci == sci) ++ return rx_sc; ++ } ++ ++ return NULL; ++} ++ ++static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) ++{ ++ /* Deliver to the uncontrolled port by default */ ++ enum rx_handler_result ret = RX_HANDLER_PASS; ++ struct ethhdr *hdr = eth_hdr(skb); ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); ++ struct net_device *ndev = macsec->secy.netdev; ++ ++ /* If h/w offloading is enabled, HW decodes frames and strips ++ * the SecTAG, so we have to deduce which port to deliver to. 
++ */ ++ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->dev_addr)) { ++ /* exact match, divert skb to this port */ ++ skb->dev = ndev; ++ skb->pkt_type = PACKET_HOST; ++ ret = RX_HANDLER_ANOTHER; ++ goto out; ++ } else if (is_multicast_ether_addr_64bits( ++ hdr->h_dest)) { ++ /* multicast frame, deliver on this port too */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ if (ether_addr_equal_64bits(hdr->h_dest, ++ ndev->broadcast)) ++ nskb->pkt_type = PACKET_BROADCAST; ++ else ++ nskb->pkt_type = PACKET_MULTICAST; ++ ++ __netif_rx(nskb); ++ } ++ continue; ++ } ++ ++ /* 10.6 If the management control validateFrames is not ++ * Strict, frames without a SecTAG are received, counted, and ++ * delivered to the Controlled Port ++ */ ++ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ continue; ++ } ++ ++ /* deliver on this port */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ nskb->dev = ndev; ++ ++ if (__netif_rx(nskb) == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) ++{ ++ struct sk_buff *skb = *pskb; ++ struct net_device *dev = skb->dev; ++ struct macsec_eth_header *hdr; ++ struct macsec_secy *secy = NULL; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ struct macsec_rxh_data *rxd; ++ struct macsec_dev *macsec; ++ unsigned int len; ++ sci_t sci; ++ u32 hdr_pn; ++ bool cbit; ++ struct pcpu_rx_sc_stats *rxsc_stats; ++ struct pcpu_secy_stats *secy_stats; ++ bool pulled_sci; ++ int ret; ++ ++ if (skb_headroom(skb) < ETH_HLEN) ++ goto drop_direct; ++ ++ hdr = macsec_ethhdr(skb); ++ if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) ++ return handle_not_macsec(skb); ++ ++ skb = skb_unshare(skb, GFP_ATOMIC); ++ *pskb = skb; ++ if (!skb) ++ return RX_HANDLER_CONSUMED; ++ ++ pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); ++ if (!pulled_sci) { ++ if (!pskb_may_pull(skb, macsec_extra_len(false))) ++ goto drop_direct; ++ } ++ ++ hdr = macsec_ethhdr(skb); ++ ++ /* Frames with a SecTAG that has the TCI E bit set but the C ++ * bit clear are discarded, as this reserved encoding is used ++ * to identify frames with a SecTAG that are not to be ++ * delivered to the Controlled Port. ++ */ ++ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) ++ return RX_HANDLER_PASS; ++ ++ /* now, pull the extra length */ ++ if (hdr->tci_an & MACSEC_TCI_SC) { ++ if (!pulled_sci) ++ goto drop_direct; ++ } ++ ++ /* ethernet header is part of crypto processing */ ++ skb_push(skb, ETH_HLEN); ++ ++ macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); ++ macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; ++ sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); ++ ++ rcu_read_lock(); ++ rxd = macsec_data_rcu(skb->dev); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); ++ ++ sc = sc ? 
macsec_rxsc_get(sc) : NULL; ++ ++ if (sc) { ++ secy = &macsec->secy; ++ rx_sc = sc; ++ break; ++ } ++ } ++ ++ if (!secy) ++ goto nosci; ++ ++ dev = secy->netdev; ++ macsec = macsec_priv(dev); ++ secy_stats = this_cpu_ptr(macsec->stats); ++ rxsc_stats = this_cpu_ptr(rx_sc->stats); ++ ++ if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsBadTag++; ++ u64_stats_update_end(&secy_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ goto drop_nosa; ++ } ++ ++ rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); ++ if (!rx_sa) { ++ /* 10.6.1 if the SA is not in use */ ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ struct macsec_rx_sa *active_rx_sa = macsec_active_rxsa_get(rx_sc); ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsNotUsingSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ secy->netdev->stats.rx_errors++; ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); ++ goto drop_nosa; ++ } ++ ++ /* not Strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsUnusedSA++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ if (active_rx_sa) ++ this_cpu_inc(active_rx_sa->stats->InPktsUnusedSA); ++ goto deliver; ++ } ++ ++ /* First, PN check to avoid decrypting obviously wrong packets */ ++ hdr_pn = ntohl(hdr->packet_number); ++ if (secy->replay_protect) { ++ bool late; ++ ++ spin_lock(&rx_sa->lock); ++ late = rx_sa->next_pn_halves.lower >= secy->replay_window && ++ hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); ++ ++ if (secy->xpn) ++ late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); ++ spin_unlock(&rx_sa->lock); ++ ++ if (late) { ++ u64_stats_update_begin(&rxsc_stats->syncp); ++ rxsc_stats->stats.InPktsLate++; ++ u64_stats_update_end(&rxsc_stats->syncp); ++ macsec->secy.netdev->stats.rx_dropped++; ++ goto drop; ++ } ++ } ++ ++ macsec_skb_cb(skb)->rx_sa = rx_sa; ++ ++ /* Disabled && !changed text => skip validation */ ++ if (hdr->tci_an & MACSEC_TCI_C || ++ secy->validate_frames != MACSEC_VALIDATE_DISABLED) ++ skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); ++ ++ if (IS_ERR(skb)) { ++ /* the decrypt callback needs the reference */ ++ if (PTR_ERR(skb) != -EINPROGRESS) { ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ } ++ rcu_read_unlock(); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ } ++ ++ if (!macsec_post_decrypt(skb, secy, hdr_pn)) ++ goto drop; ++ ++deliver: ++ macsec_finalize_skb(skb, secy->icv_len, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ len = skb->len; ++ macsec_reset_skb(skb, secy->netdev); ++ ++ if (rx_sa) ++ macsec_rxsa_put(rx_sa); ++ macsec_rxsc_put(rx_sc); ++ ++ skb_orphan(skb); ++ ret = gro_cells_receive(&macsec->gro_cells, skb); ++ if (ret == NET_RX_SUCCESS) ++ count_rx(dev, len); ++ else ++ macsec->secy.netdev->stats.rx_dropped++; ++ ++ rcu_read_unlock(); ++ ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++drop: ++ macsec_rxsa_put(rx_sa); ++drop_nosa: ++ macsec_rxsc_put(rx_sc); ++ rcu_read_unlock(); ++drop_direct: ++ kfree_skb(skb); ++ *pskb = NULL; ++ return RX_HANDLER_CONSUMED; ++ ++nosci: ++ /* 10.6.1 if the SC is not found */ ++ cbit = !!(hdr->tci_an & MACSEC_TCI_C); ++ if (!cbit) ++ macsec_finalize_skb(skb, 
DEFAULT_ICV_LEN, ++ macsec_extra_len(macsec_skb_cb(skb)->has_sci)); ++ ++ list_for_each_entry_rcu(macsec, &rxd->secys, secys) { ++ struct sk_buff *nskb; ++ ++ secy_stats = this_cpu_ptr(macsec->stats); ++ ++ /* If validateFrames is Strict or the C bit in the ++ * SecTAG is set, discard ++ */ ++ if (cbit || ++ macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsNoSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ macsec->secy.netdev->stats.rx_errors++; ++ continue; ++ } ++ ++ /* not strict, the frame (with the SecTAG and ICV ++ * removed) is delivered to the Controlled Port. ++ */ ++ nskb = skb_clone(skb, GFP_ATOMIC); ++ if (!nskb) ++ break; ++ ++ macsec_reset_skb(nskb, macsec->secy.netdev); ++ ++ ret = __netif_rx(nskb); ++ if (ret == NET_RX_SUCCESS) { ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.InPktsUnknownSCI++; ++ u64_stats_update_end(&secy_stats->syncp); ++ } else { ++ macsec->secy.netdev->stats.rx_dropped++; ++ } ++ } ++ ++ rcu_read_unlock(); ++ *pskb = skb; ++ return RX_HANDLER_PASS; ++} ++ ++static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) ++{ ++ struct crypto_aead *tfm; ++ int ret; ++ ++ /* Pick a sync gcm(aes) cipher to ensure order is preserved. */ ++ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); ++ ++ if (IS_ERR(tfm)) ++ return tfm; ++ ++ ret = crypto_aead_setkey(tfm, key, key_len); ++ if (ret < 0) ++ goto fail; ++ ++ ret = crypto_aead_setauthsize(tfm, icv_len); ++ if (ret < 0) ++ goto fail; ++ ++ return tfm; ++fail: ++ crypto_free_aead(tfm); ++ return ERR_PTR(ret); ++} ++ ++static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); ++ if (!rx_sa->stats) ++ return -ENOMEM; ++ ++ rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(rx_sa->key.tfm)) { ++ free_percpu(rx_sa->stats); ++ return PTR_ERR(rx_sa->key.tfm); ++ } ++ ++ rx_sa->ssci = MACSEC_UNDEF_SSCI; ++ rx_sa->active = false; ++ rx_sa->next_pn = 1; ++ refcount_set(&rx_sa->refcnt, 1); ++ spin_lock_init(&rx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_rx_sa(struct macsec_rx_sa *rx_sa) ++{ ++ rx_sa->active = false; ++ ++ macsec_rxsa_put(rx_sa); ++} ++ ++static void free_rx_sc(struct macsec_rx_sc *rx_sc) ++{ ++ int i; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); ++ ++ RCU_INIT_POINTER(rx_sc->sa[i], NULL); ++ if (sa) ++ clear_rx_sa(sa); ++ } ++ ++ macsec_rxsc_put(rx_sc); ++} ++ ++static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) ++{ ++ struct macsec_rx_sc *rx_sc, __rcu **rx_scp; ++ ++ for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); ++ rx_sc; ++ rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { ++ if (rx_sc->sci == sci) { ++ if (rx_sc->active) ++ secy->n_rx_sc--; ++ rcu_assign_pointer(*rx_scp, rx_sc->next); ++ return rx_sc; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, ++ bool active) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_dev *macsec; ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ struct macsec_secy *secy; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (find_rx_sc_rtnl(&macsec->secy, sci)) ++ return ERR_PTR(-EEXIST); ++ } ++ ++ rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); ++ if 
(!rx_sc) ++ return ERR_PTR(-ENOMEM); ++ ++ rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); ++ if (!rx_sc->stats) { ++ kfree(rx_sc); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ rx_sc->sci = sci; ++ rx_sc->active = active; ++ refcount_set(&rx_sc->refcnt, 1); ++ ++ secy = &macsec_priv(dev)->secy; ++ rcu_assign_pointer(rx_sc->next, secy->rx_sc); ++ rcu_assign_pointer(secy->rx_sc, rx_sc); ++ ++ if (rx_sc->active) ++ secy->n_rx_sc++; ++ ++ return rx_sc; ++} ++ ++static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, ++ int icv_len) ++{ ++ tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); ++ if (!tx_sa->stats) ++ return -ENOMEM; ++ ++ tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); ++ if (IS_ERR(tx_sa->key.tfm)) { ++ free_percpu(tx_sa->stats); ++ return PTR_ERR(tx_sa->key.tfm); ++ } ++ ++ tx_sa->ssci = MACSEC_UNDEF_SSCI; ++ tx_sa->active = false; ++ refcount_set(&tx_sa->refcnt, 1); ++ spin_lock_init(&tx_sa->lock); ++ ++ return 0; ++} ++ ++static void clear_tx_sa(struct macsec_tx_sa *tx_sa) ++{ ++ tx_sa->active = false; ++ ++ macsec_txsa_put(tx_sa); ++} ++ ++static struct genl_family macsec_fam; ++ ++static struct net_device *get_dev_from_nl(struct net *net, ++ struct nlattr **attrs) ++{ ++ int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); ++ struct net_device *dev; ++ ++ dev = __dev_get_by_index(net, ifindex); ++ if (!dev) ++ return ERR_PTR(-ENODEV); ++ ++ if (!netif_is_macsec(dev)) ++ return ERR_PTR(-ENODEV); ++ ++ return dev; ++} ++ ++static enum macsec_offload nla_get_offload(const struct nlattr *nla) ++{ ++ return (__force enum macsec_offload)nla_get_u8(nla); ++} ++ ++static sci_t nla_get_sci(const struct nlattr *nla) ++{ ++ return (__force sci_t)nla_get_u64(nla); ++} ++ ++static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, ++ int padattr) ++{ ++ return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); ++} ++ ++static ssci_t nla_get_ssci(const struct nlattr *nla) ++{ ++ return (__force ssci_t)nla_get_u32(nla); ++} ++ ++static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) ++{ ++ return nla_put_u32(skb, attrtype, (__force u64)value); ++} ++ ++static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_tx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); ++ if (!tx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *devp = dev; ++ *scp = tx_sc; ++ *secyp = secy; ++ return tx_sa; ++} ++ ++static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct net_device **devp, ++ struct macsec_secy **secyp) ++{ ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ ++ dev = get_dev_from_nl(net, attrs); ++ if (IS_ERR(dev)) ++ return ERR_CAST(dev); ++ ++ secy = &macsec_priv(dev)->secy; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return ERR_PTR(-EINVAL); ++ ++ sci = 
nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ rx_sc = find_rx_sc_rtnl(secy, sci); ++ if (!rx_sc) ++ return ERR_PTR(-ENODEV); ++ ++ *secyp = secy; ++ *devp = dev; ++ ++ return rx_sc; ++} ++ ++static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, ++ struct nlattr **attrs, ++ struct nlattr **tb_rxsc, ++ struct nlattr **tb_sa, ++ struct net_device **devp, ++ struct macsec_secy **secyp, ++ struct macsec_rx_sc **scp, ++ u8 *assoc_num) ++{ ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ ++ if (!tb_sa[MACSEC_SA_ATTR_AN]) ++ return ERR_PTR(-EINVAL); ++ ++ *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ if (*assoc_num >= MACSEC_NUM_AN) ++ return ERR_PTR(-EINVAL); ++ ++ rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); ++ if (IS_ERR(rx_sc)) ++ return ERR_CAST(rx_sc); ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); ++ if (!rx_sa) ++ return ERR_PTR(-ENODEV); ++ ++ *scp = rx_sc; ++ return rx_sa; ++} ++ ++static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { ++ [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, ++ [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, ++ [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { ++ [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, ++ [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, ++}; ++ ++static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { ++ [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, ++ [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), ++ [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, ++ .len = MACSEC_KEYID_LEN, }, ++ [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, ++ .len = MACSEC_MAX_KEY_LEN, }, ++ [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, ++ [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, ++ .len = MACSEC_SALT_LEN, }, ++}; ++ ++static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { ++ [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, ++}; ++ ++/* Offloads an operation to a device driver */ ++static int macsec_offload(int (* const func)(struct macsec_context *), ++ struct macsec_context *ctx) ++{ ++ int ret; ++ ++ if (unlikely(!func)) ++ return 0; ++ ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_lock(&ctx->phydev->lock); ++ ++ /* Phase I: prepare. The drive should fail here if there are going to be ++ * issues in the commit phase. ++ */ ++ ctx->prepare = true; ++ ret = (*func)(ctx); ++ if (ret) ++ goto phy_unlock; ++ ++ /* Phase II: commit. This step cannot fail. 
*/ ++ ctx->prepare = false; ++ ret = (*func)(ctx); ++ /* This should never happen: commit is not allowed to fail */ ++ if (unlikely(ret)) ++ WARN(1, "MACsec offloading commit failed (%d)\n", ret); ++ ++phy_unlock: ++ if (ctx->offload == MACSEC_OFFLOAD_PHY) ++ mutex_unlock(&ctx->phydev->lock); ++ ++ return ret; ++} ++ ++static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) ++{ ++ if (!attrs[MACSEC_ATTR_SA_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) ++{ ++ if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static bool validate_add_rxsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && ++ nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (tb_sa[MACSEC_SA_ATTR_PN] && ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); ++ if (rx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); ++ if (!rx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(rx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ rx_sa->sc = rx_sc; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_rxsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ macsec_rxsa_put(rx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static bool validate_add_rxsc(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_RXSC_ATTR_SCI]) ++ return false; ++ ++ if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ sci_t sci = MACSEC_UNDEF_SCI; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct macsec_secy *secy; ++ bool active = true; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) ++ active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ rx_sc = create_rx_sc(dev, sci, active); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return 
PTR_ERR(rx_sc); ++ } ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_add_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ del_rx_sc(secy, sci); ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_add_txsa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ !attrs[MACSEC_SA_ATTR_PN] || ++ !attrs[MACSEC_SA_ATTR_KEY] || ++ !attrs[MACSEC_SA_ATTR_KEYID]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) ++ return false; ++ ++ return true; ++} ++ ++static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct net_device *dev; ++ struct nlattr **attrs = info->attrs; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ unsigned char assoc_num; ++ int pn_len; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational; ++ int err; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_add_txsa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { ++ pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (secy->xpn) { ++ if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { ++ pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), ++ MACSEC_SALT_LEN); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ } ++ ++ tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); ++ if (tx_sa) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); ++ if (!tx_sa) { ++ rtnl_unlock(); ++ return -ENOMEM; ++ } ++ ++ err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len, secy->icv_len); ++ if (err < 0) { ++ kfree(tx_sa); ++ rtnl_unlock(); ++ return err; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa && tx_sa->active) ++ secy->operational = true; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), ++ secy->key_len); ++ ++ err = macsec_offload(ops->mdo_add_txsa, &ctx); ++ memzero_explicit(ctx.sa.key, secy->key_len); ++ if (err) ++ goto cleanup; ++ } ++ ++ if (secy->xpn) { ++ tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); ++ nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], ++ MACSEC_SALT_LEN); ++ } ++ ++ nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); ++ rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->operational = was_operational; ++ macsec_txsa_put(tx_sa); ++ rtnl_unlock(); ++ return err; ++} ++ ++static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (rx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ 
ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); ++ clear_rx_sa(rx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ sci_t sci; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ dev = get_dev_from_nl(genl_info_net(info), info->attrs); ++ if (IS_ERR(dev)) { ++ rtnl_unlock(); ++ return PTR_ERR(dev); ++ } ++ ++ secy = &macsec_priv(dev)->secy; ++ sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); ++ ++ rx_sc = del_rx_sc(secy, sci); ++ if (!rx_sc) { ++ rtnl_unlock(); ++ return -ENODEV; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ret = macsec_offload(ops->mdo_del_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ free_rx_sc(rx_sc); ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tx_sa->active) { ++ rtnl_unlock(); ++ return -EBUSY; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_del_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); ++ clear_tx_sa(tx_sa); ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool validate_upd_sa(struct nlattr **attrs) ++{ ++ if (!attrs[MACSEC_SA_ATTR_AN] || ++ attrs[MACSEC_SA_ATTR_KEY] || ++ attrs[MACSEC_SA_ATTR_KEYID] || ++ attrs[MACSEC_SA_ATTR_SSCI] || ++ attrs[MACSEC_SA_ATTR_SALT]) ++ return false; ++ ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) ++ return false; ++ ++ if (attrs[MACSEC_SA_ATTR_ACTIVE]) { ++ if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) ++ return false; ++ } ++ ++ return true; ++} ++ ++static int 
macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ struct macsec_tx_sa *tx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_operational, was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, ++ &dev, &secy, &tx_sc, &assoc_num); ++ if (IS_ERR(tx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(tx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&tx_sa->lock); ++ prev_pn = tx_sa->next_pn_halves; ++ tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ ++ was_active = tx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ was_operational = secy->operational; ++ if (assoc_num == tx_sc->encoding_sa) ++ secy->operational = tx_sa->active; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_txsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&tx_sa->lock); ++ tx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&tx_sa->lock); ++ } ++ tx_sa->active = was_active; ++ secy->operational = was_operational; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct macsec_rx_sa *rx_sa; ++ u8 assoc_num; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; ++ bool was_active; ++ pn_t prev_pn; ++ int ret = 0; ++ ++ prev_pn.full64 = 0; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (parse_sa_config(attrs, tb_sa)) ++ return -EINVAL; ++ ++ if (!validate_upd_sa(tb_sa)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, ++ &dev, &secy, &rx_sc, &assoc_num); ++ if (IS_ERR(rx_sa)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sa); ++ } ++ ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ int pn_len; ++ ++ pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; ++ if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { ++ pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", ++ nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&rx_sa->lock); ++ prev_pn = rx_sa->next_pn_halves; ++ rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ ++ was_active = rx_sa->active; ++ if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) ++ rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.sa.assoc_num = assoc_num; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ return 0; ++ ++cleanup: ++ if (tb_sa[MACSEC_SA_ATTR_PN]) { ++ spin_lock_bh(&rx_sa->lock); ++ rx_sa->next_pn_halves = prev_pn; ++ spin_unlock_bh(&rx_sa->lock); ++ } ++ rx_sa->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ struct macsec_secy *secy; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; ++ unsigned int prev_n_rx_sc; ++ bool was_active; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (parse_rxsc_config(attrs, tb_rxsc)) ++ return -EINVAL; ++ ++ if (!validate_add_rxsc(tb_rxsc)) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); ++ if (IS_ERR(rx_sc)) { ++ rtnl_unlock(); ++ return PTR_ERR(rx_sc); ++ } ++ ++ was_active = rx_sc->active; ++ prev_n_rx_sc = secy->n_rx_sc; ++ if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { ++ bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); ++ ++ if (rx_sc->active != new) ++ secy->n_rx_sc += new ? 
1 : -1; ++ ++ rx_sc->active = new; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.rx_sc = rx_sc; ++ ctx.secy = secy; ++ ++ ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ rtnl_unlock(); ++ ++ return 0; ++ ++cleanup: ++ secy->n_rx_sc = prev_n_rx_sc; ++ rx_sc->active = was_active; ++ rtnl_unlock(); ++ return ret; ++} ++ ++static bool macsec_is_configured(struct macsec_dev *macsec) ++{ ++ struct macsec_secy *secy = &macsec->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ int i; ++ ++ if (secy->rx_sc) ++ return true; ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) ++ if (tx_sc->sa[i]) ++ return true; ++ ++ return false; ++} ++ ++static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; ++ enum macsec_offload offload, prev_offload; ++ int (*func)(struct macsec_context *ctx); ++ struct nlattr **attrs = info->attrs; ++ struct net_device *dev; ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ struct macsec_dev *macsec; ++ int ret; ++ ++ if (!attrs[MACSEC_ATTR_IFINDEX]) ++ return -EINVAL; ++ ++ if (!attrs[MACSEC_ATTR_OFFLOAD]) ++ return -EINVAL; ++ ++ if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, ++ attrs[MACSEC_ATTR_OFFLOAD], ++ macsec_genl_offload_policy, NULL)) ++ return -EINVAL; ++ ++ dev = get_dev_from_nl(genl_info_net(info), attrs); ++ if (IS_ERR(dev)) ++ return PTR_ERR(dev); ++ macsec = macsec_priv(dev); ++ ++ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) ++ return -EINVAL; ++ ++ offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); ++ if (macsec->offload == offload) ++ return 0; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* Check if the net device is busy. */ ++ if (netif_running(dev)) ++ return -EBUSY; ++ ++ rtnl_lock(); ++ ++ prev_offload = macsec->offload; ++ macsec->offload = offload; ++ ++ /* Check if the device already has rules configured: we do not support ++ * rules migration. ++ */ ++ if (macsec_is_configured(macsec)) { ++ ret = -EBUSY; ++ goto rollback; ++ } ++ ++ ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? 
prev_offload : offload, ++ macsec, &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto rollback; ++ } ++ ++ if (prev_offload == MACSEC_OFFLOAD_OFF) ++ func = ops->mdo_add_secy; ++ else ++ func = ops->mdo_del_secy; ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(func, &ctx); ++ if (ret) ++ goto rollback; ++ ++ rtnl_unlock(); ++ return 0; ++ ++rollback: ++ macsec->offload = prev_offload; ++ ++ rtnl_unlock(); ++ return ret; ++} ++ ++static void get_tx_sa_stats(struct net_device *dev, int an, ++ struct macsec_tx_sa *tx_sa, ++ struct macsec_tx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.tx_sa = tx_sa; ++ ctx.stats.tx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_tx_sa_stats *stats = ++ per_cpu_ptr(tx_sa->stats, cpu); ++ ++ sum->OutPktsProtected += stats->OutPktsProtected; ++ sum->OutPktsEncrypted += stats->OutPktsEncrypted; ++ } ++} ++ ++static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sa_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, int an, ++ struct macsec_rx_sa *rx_sa, ++ struct macsec_rx_sa_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.sa.assoc_num = an; ++ ctx.sa.rx_sa = rx_sa; ++ ctx.stats.rx_sa_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct macsec_rx_sa_stats *stats = ++ per_cpu_ptr(rx_sa->stats, cpu); ++ ++ sum->InPktsOK += stats->InPktsOK; ++ sum->InPktsInvalid += stats->InPktsInvalid; ++ sum->InPktsNotValid += stats->InPktsNotValid; ++ sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; ++ sum->InPktsUnusedSA += stats->InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sa_stats(struct sk_buff *skb, ++ struct macsec_rx_sa_stats *sum) ++{ ++ if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA) || ++ nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_rx_sc_stats(struct net_device *dev, ++ struct macsec_rx_sc *rx_sc, ++ struct macsec_rx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = 
macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.rx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ ctx.rx_sc = rx_sc; ++ macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_rx_sc_stats *stats; ++ struct macsec_rx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(rx_sc->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->InOctetsValidated += tmp.InOctetsValidated; ++ sum->InOctetsDecrypted += tmp.InOctetsDecrypted; ++ sum->InPktsUnchecked += tmp.InPktsUnchecked; ++ sum->InPktsDelayed += tmp.InPktsDelayed; ++ sum->InPktsOK += tmp.InPktsOK; ++ sum->InPktsInvalid += tmp.InPktsInvalid; ++ sum->InPktsLate += tmp.InPktsLate; ++ sum->InPktsNotValid += tmp.InPktsNotValid; ++ sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; ++ sum->InPktsUnusedSA += tmp.InPktsUnusedSA; ++ } ++} ++ ++static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, ++ sum->InOctetsValidated, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, ++ sum->InOctetsDecrypted, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, ++ sum->InPktsUnchecked, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, ++ sum->InPktsDelayed, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, ++ sum->InPktsOK, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, ++ sum->InPktsInvalid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, ++ sum->InPktsLate, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, ++ sum->InPktsNotValid, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, ++ sum->InPktsNotUsingSA, ++ MACSEC_RXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, ++ sum->InPktsUnusedSA, ++ MACSEC_RXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_tx_sc_stats(struct net_device *dev, ++ struct macsec_tx_sc_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.tx_sc_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_tx_sc_stats *stats; ++ struct macsec_tx_sc_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsProtected += tmp.OutPktsProtected; ++ sum->OutPktsEncrypted += tmp.OutPktsEncrypted; ++ sum->OutOctetsProtected += tmp.OutOctetsProtected; ++ sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; ++ } ++} ++ ++static int 
copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, ++ sum->OutPktsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, ++ sum->OutPktsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, ++ sum->OutOctetsProtected, ++ MACSEC_TXSC_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, ++ sum->OutOctetsEncrypted, ++ MACSEC_TXSC_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ int cpu; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.stats.dev_stats = sum; ++ ctx.secy = &macsec_priv(dev)->secy; ++ macsec_offload(ops->mdo_get_dev_stats, &ctx); ++ } ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ const struct pcpu_secy_stats *stats; ++ struct macsec_dev_stats tmp; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ memcpy(&tmp, &stats->stats, sizeof(tmp)); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ sum->OutPktsUntagged += tmp.OutPktsUntagged; ++ sum->InPktsUntagged += tmp.InPktsUntagged; ++ sum->OutPktsTooLong += tmp.OutPktsTooLong; ++ sum->InPktsNoTag += tmp.InPktsNoTag; ++ sum->InPktsBadTag += tmp.InPktsBadTag; ++ sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; ++ sum->InPktsNoSCI += tmp.InPktsNoSCI; ++ sum->InPktsOverrun += tmp.InPktsOverrun; ++ } ++} ++ ++static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) ++{ ++ if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, ++ sum->OutPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, ++ sum->InPktsUntagged, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, ++ sum->OutPktsTooLong, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, ++ sum->InPktsNoTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, ++ sum->InPktsBadTag, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, ++ sum->InPktsUnknownSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, ++ sum->InPktsNoSCI, ++ MACSEC_SECY_STATS_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, ++ sum->InPktsOverrun, ++ MACSEC_SECY_STATS_ATTR_PAD)) ++ return -EMSGSIZE; ++ ++ return 0; ++} ++ ++static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) ++{ ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *secy_nest = nla_nest_start_noflag(skb, ++ MACSEC_ATTR_SECY); ++ u64 csid; ++ ++ if (!secy_nest) ++ return 1; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto cancel; ++ } ++ ++ if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, ++ MACSEC_SECY_ATTR_PAD) || ++ nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, ++ csid, MACSEC_SECY_ATTR_PAD) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || ++ nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) ++ goto cancel; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) ++ goto cancel; ++ } ++ ++ nla_nest_end(skb, secy_nest); ++ return 0; ++ ++cancel: ++ nla_nest_cancel(skb, secy_nest); ++ return 1; ++} ++ ++static noinline_for_stack int ++dump_secy(struct macsec_secy *secy, struct net_device *dev, ++ struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct macsec_tx_sc_stats tx_sc_stats = {0, }; ++ struct macsec_tx_sa_stats tx_sa_stats = {0, }; ++ struct macsec_rx_sc_stats rx_sc_stats = {0, }; ++ struct macsec_rx_sa_stats rx_sa_stats = {0, }; ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_dev_stats dev_stats = {0, }; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ struct nlattr *txsa_list, *rxsc_list; ++ struct macsec_rx_sc *rx_sc; ++ struct nlattr *attr; ++ void *hdr; ++ int i, j; ++ ++ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); ++ if (!hdr) ++ return -EMSGSIZE; ++ ++ genl_dump_check_consistent(cb, hdr); ++ ++ if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); ++ if (!attr) ++ goto nla_put_failure; ++ if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) ++ goto nla_put_failure; ++ nla_nest_end(skb, attr); ++ ++ if (nla_put_secy(secy, skb)) ++ goto nla_put_failure; ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ ++ get_tx_sc_stats(dev, &tx_sc_stats); ++ if (copy_tx_sc_stats(skb, &tx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); ++ if (!attr) ++ goto nla_put_failure; ++ get_secy_stats(dev, &dev_stats); ++ if (copy_secy_stats(skb, &dev_stats)) { ++ nla_nest_cancel(skb, attr); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); ++ if (!txsa_list) ++ goto nla_put_failure; ++ for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); ++ struct nlattr *txsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!tx_sa) ++ continue; ++ ++ txsa_nest = nla_nest_start_noflag(skb, j++); ++ if (!txsa_nest) { ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto 
nla_put_failure; ++ } ++ memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); ++ get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); ++ if (copy_tx_sa_stats(skb, &tx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = tx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = tx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { ++ nla_nest_cancel(skb, txsa_nest); ++ nla_nest_cancel(skb, txsa_list); ++ goto nla_put_failure; ++ } ++ ++ nla_nest_end(skb, txsa_nest); ++ } ++ nla_nest_end(skb, txsa_list); ++ ++ rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); ++ if (!rxsc_list) ++ goto nla_put_failure; ++ ++ j = 1; ++ for_each_rxsc_rtnl(secy, rx_sc) { ++ int k; ++ struct nlattr *rxsa_list; ++ struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); ++ ++ if (!rxsc_nest) { ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || ++ nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, ++ MACSEC_RXSC_ATTR_PAD)) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); ++ get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); ++ if (copy_rx_sc_stats(skb, &rx_sc_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ rxsa_list = nla_nest_start_noflag(skb, ++ MACSEC_RXSC_ATTR_SA_LIST); ++ if (!rxsa_list) { ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { ++ struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); ++ struct nlattr *rxsa_nest; ++ u64 pn; ++ int pn_len; ++ ++ if (!rx_sa) ++ continue; ++ ++ rxsa_nest = nla_nest_start_noflag(skb, k++); ++ if (!rxsa_nest) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ ++ attr = nla_nest_start_noflag(skb, ++ MACSEC_SA_ATTR_STATS); ++ if (!attr) { ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); ++ get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); ++ if (copy_rx_sa_stats(skb, &rx_sa_stats)) { ++ nla_nest_cancel(skb, attr); ++ nla_nest_cancel(skb, rxsa_list); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, attr); ++ ++ if (secy->xpn) { ++ pn = rx_sa->next_pn; ++ pn_len = MACSEC_XPN_PN_LEN; ++ } else { ++ pn = rx_sa->next_pn_halves.lower; ++ pn_len = MACSEC_DEFAULT_PN_LEN; ++ } ++ ++ if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || ++ nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || ++ nla_put(skb, 
MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || ++ (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || ++ nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { ++ nla_nest_cancel(skb, rxsa_nest); ++ nla_nest_cancel(skb, rxsc_nest); ++ nla_nest_cancel(skb, rxsc_list); ++ goto nla_put_failure; ++ } ++ nla_nest_end(skb, rxsa_nest); ++ } ++ ++ nla_nest_end(skb, rxsa_list); ++ nla_nest_end(skb, rxsc_nest); ++ } ++ ++ nla_nest_end(skb, rxsc_list); ++ ++ genlmsg_end(skb, hdr); ++ ++ return 0; ++ ++nla_put_failure: ++ genlmsg_cancel(skb, hdr); ++ return -EMSGSIZE; ++} ++ ++static int macsec_generation = 1; /* protected by RTNL */ ++ ++static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct net_device *dev; ++ int dev_idx, d; ++ ++ dev_idx = cb->args[0]; ++ ++ d = 0; ++ rtnl_lock(); ++ ++ cb->seq = macsec_generation; ++ ++ for_each_netdev(net, dev) { ++ struct macsec_secy *secy; ++ ++ if (d < dev_idx) ++ goto next; ++ ++ if (!netif_is_macsec(dev)) ++ goto next; ++ ++ secy = &macsec_priv(dev)->secy; ++ if (dump_secy(secy, dev, skb, cb) < 0) ++ goto done; ++next: ++ d++; ++ } ++ ++done: ++ rtnl_unlock(); ++ cb->args[0] = d; ++ return skb->len; ++} ++ ++static const struct genl_small_ops macsec_genl_ops[] = { ++ { ++ .cmd = MACSEC_CMD_GET_TXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .dumpit = macsec_dump_txsc, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSC, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsc, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_TXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_txsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_ADD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_add_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_DEL_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_del_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_RXSA, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_rxsa, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MACSEC_CMD_UPD_OFFLOAD, ++ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++ .doit = macsec_upd_offload, ++ .flags = GENL_ADMIN_PERM, ++ }, ++}; ++ ++static struct genl_family macsec_fam __ro_after_init = { ++ .name = MACSEC_GENL_NAME, ++ .hdrsize = 0, ++ .version = MACSEC_GENL_VERSION, ++ .maxattr = MACSEC_ATTR_MAX, ++ .policy = macsec_genl_policy, ++ .netnsok = true, ++ .module = THIS_MODULE, ++ .small_ops = macsec_genl_ops, ++ .n_small_ops = ARRAY_SIZE(macsec_genl_ops), ++}; ++ ++static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, ++ 
struct net_device *dev) ++{ ++ struct macsec_dev *macsec = netdev_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ struct pcpu_secy_stats *secy_stats; ++ int ret, len; ++ ++ if (macsec_is_offloaded(netdev_priv(dev))) { ++ skb->dev = macsec->real_dev; ++ return dev_queue_xmit(skb); ++ } ++ ++ /* 10.5 */ ++ if (!secy->protect_frames) { ++ secy_stats = this_cpu_ptr(macsec->stats); ++ u64_stats_update_begin(&secy_stats->syncp); ++ secy_stats->stats.OutPktsUntagged++; ++ u64_stats_update_end(&secy_stats->syncp); ++ skb->dev = macsec->real_dev; ++ len = skb->len; ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++ } ++ ++ if (!secy->operational) { ++ kfree_skb(skb); ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ len = skb->len; ++ skb = macsec_encrypt(skb, dev); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) != -EINPROGRESS) ++ dev->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ } ++ ++ macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); ++ ++ macsec_encrypt_finish(skb, dev); ++ ret = dev_queue_xmit(skb); ++ count_tx(dev, ret, len); ++ return ret; ++} ++ ++#define MACSEC_FEATURES \ ++ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) ++ ++static int macsec_dev_init(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ return -ENOMEM; ++ ++ err = gro_cells_init(&macsec->gro_cells, dev); ++ if (err) { ++ free_percpu(dev->tstats); ++ return err; ++ } ++ ++ dev->features = real_dev->features & MACSEC_FEATURES; ++ dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; ++ ++ dev->needed_headroom = real_dev->needed_headroom + ++ MACSEC_NEEDED_HEADROOM; ++ dev->needed_tailroom = real_dev->needed_tailroom + ++ MACSEC_NEEDED_TAILROOM; ++ ++ if (is_zero_ether_addr(dev->dev_addr)) ++ eth_hw_addr_inherit(dev, real_dev); ++ if (is_zero_ether_addr(dev->broadcast)) ++ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); ++ ++ /* Get macsec's reference to real_dev */ ++ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); ++ ++ return 0; ++} ++ ++static void macsec_dev_uninit(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ gro_cells_destroy(&macsec->gro_cells); ++ free_percpu(dev->tstats); ++} ++ ++static netdev_features_t macsec_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ features &= (real_dev->features & MACSEC_FEATURES) | ++ NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; ++ features |= NETIF_F_LLTX; ++ ++ return features; ++} ++ ++static int macsec_dev_open(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ int err; ++ ++ err = dev_uc_add(real_dev, dev->dev_addr); ++ if (err < 0) ++ return err; ++ ++ if (dev->flags & IFF_ALLMULTI) { ++ err = dev_set_allmulti(real_dev, 1); ++ if (err < 0) ++ goto del_unicast; ++ } ++ ++ if (dev->flags & IFF_PROMISC) { ++ err = dev_set_promiscuity(real_dev, 1); ++ if (err < 0) ++ goto clear_allmulti; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ err = -EOPNOTSUPP; ++ goto clear_allmulti; ++ } 
++ ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_dev_open, &ctx); ++ if (err) ++ goto clear_allmulti; ++ } ++ ++ if (netif_carrier_ok(real_dev)) ++ netif_carrier_on(dev); ++ ++ return 0; ++clear_allmulti: ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++del_unicast: ++ dev_uc_del(real_dev, dev->dev_addr); ++ netif_carrier_off(dev); ++ return err; ++} ++ ++static int macsec_dev_stop(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ netif_carrier_off(dev); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_dev_stop, &ctx); ++ } ++ } ++ ++ dev_mc_unsync(real_dev, dev); ++ dev_uc_unsync(real_dev, dev); ++ ++ if (dev->flags & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, -1); ++ ++ if (dev->flags & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, -1); ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++ return 0; ++} ++ ++static void macsec_dev_change_rx_flags(struct net_device *dev, int change) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ if (!(dev->flags & IFF_UP)) ++ return; ++ ++ if (change & IFF_ALLMULTI) ++ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); ++ ++ if (change & IFF_PROMISC) ++ dev_set_promiscuity(real_dev, ++ dev->flags & IFF_PROMISC ? 1 : -1); ++} ++ ++static void macsec_dev_set_rx_mode(struct net_device *dev) ++{ ++ struct net_device *real_dev = macsec_priv(dev)->real_dev; ++ ++ dev_mc_sync(real_dev, dev); ++ dev_uc_sync(real_dev, dev); ++} ++ ++static int macsec_set_mac_address(struct net_device *dev, void *p) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct sockaddr *addr = p; ++ int err; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ if (!(dev->flags & IFF_UP)) ++ goto out; ++ ++ err = dev_uc_add(real_dev, addr->sa_data); ++ if (err < 0) ++ return err; ++ ++ dev_uc_del(real_dev, dev->dev_addr); ++ ++out: ++ eth_hw_addr_set(dev, addr->sa_data); ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_upd_secy, &ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++static int macsec_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); ++ ++ if (macsec->real_dev->mtu - extra < new_mtu) ++ return -ERANGE; ++ ++ dev->mtu = new_mtu; ++ ++ return 0; ++} ++ ++static void macsec_get_stats64(struct net_device *dev, ++ struct rtnl_link_stats64 *s) ++{ ++ if (!dev->tstats) ++ return; ++ ++ dev_fetch_sw_netstats(s, dev->tstats); ++ ++ s->rx_dropped = dev->stats.rx_dropped; ++ s->tx_dropped = dev->stats.tx_dropped; ++ s->rx_errors = dev->stats.rx_errors; ++} ++ ++static int macsec_get_iflink(const struct net_device *dev) ++{ ++ return macsec_priv(dev)->real_dev->ifindex; ++} ++ ++static const struct net_device_ops macsec_netdev_ops = { ++ .ndo_init = macsec_dev_init, ++ .ndo_uninit = macsec_dev_uninit, ++ .ndo_open = macsec_dev_open, ++ .ndo_stop = macsec_dev_stop, ++ .ndo_fix_features = 
macsec_fix_features, ++ .ndo_change_mtu = macsec_change_mtu, ++ .ndo_set_rx_mode = macsec_dev_set_rx_mode, ++ .ndo_change_rx_flags = macsec_dev_change_rx_flags, ++ .ndo_set_mac_address = macsec_set_mac_address, ++ .ndo_start_xmit = macsec_start_xmit, ++ .ndo_get_stats64 = macsec_get_stats64, ++ .ndo_get_iflink = macsec_get_iflink, ++}; ++ ++static const struct device_type macsec_type = { ++ .name = "macsec", ++}; ++ ++static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { ++ [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, ++ [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, ++ [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, ++ [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_ES] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, ++ [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, ++}; ++ ++static void macsec_free_netdev(struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ ++ free_percpu(macsec->stats); ++ free_percpu(macsec->secy.tx_sc.stats); ++ ++ /* Get rid of the macsec's reference to real_dev */ ++ netdev_put(macsec->real_dev, &macsec->dev_tracker); ++} ++ ++static void macsec_setup(struct net_device *dev) ++{ ++ ether_setup(dev); ++ dev->min_mtu = 0; ++ dev->max_mtu = ETH_MAX_MTU; ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->netdev_ops = &macsec_netdev_ops; ++ dev->needs_free_netdev = true; ++ dev->priv_destructor = macsec_free_netdev; ++ SET_NETDEV_DEVTYPE(dev, &macsec_type); ++ ++ eth_zero_addr(dev->broadcast); ++} ++ ++static int macsec_changelink_common(struct net_device *dev, ++ struct nlattr *data[]) ++{ ++ struct macsec_secy *secy; ++ struct macsec_tx_sc *tx_sc; ++ ++ secy = &macsec_priv(dev)->secy; ++ tx_sc = &secy->tx_sc; ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ struct macsec_tx_sa *tx_sa; ++ ++ tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); ++ tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); ++ ++ secy->operational = tx_sa && tx_sa->active; ++ } ++ ++ if (data[IFLA_MACSEC_ENCRYPT]) ++ tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); ++ ++ if (data[IFLA_MACSEC_PROTECT]) ++ secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); ++ ++ if (data[IFLA_MACSEC_INC_SCI]) ++ tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (data[IFLA_MACSEC_ES]) ++ tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); ++ ++ if (data[IFLA_MACSEC_SCB]) ++ tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); ++ ++ if (data[IFLA_MACSEC_REPLAY_PROTECT]) ++ secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); ++ ++ if (data[IFLA_MACSEC_VALIDATION]) ++ secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) { ++ switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = false; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; ++ secy->xpn = true; ++ break; ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ secy->key_len = 
MACSEC_GCM_AES_256_SAK_LEN; ++ secy->xpn = true; ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (data[IFLA_MACSEC_WINDOW]) { ++ secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); ++ ++ /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window ++ * for XPN cipher suites */ ++ if (secy->xpn && ++ secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], ++ struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_tx_sc tx_sc; ++ struct macsec_secy secy; ++ int ret; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE] || ++ data[IFLA_MACSEC_ICV_LEN] || ++ data[IFLA_MACSEC_SCI] || ++ data[IFLA_MACSEC_PORT]) ++ return -EINVAL; ++ ++ /* Keep a copy of unmodified secy and tx_sc, in case the offload ++ * propagation fails, to revert macsec_changelink_common. ++ */ ++ memcpy(&secy, &macsec->secy, sizeof(secy)); ++ memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ++ ++ ret = macsec_changelink_common(dev, data); ++ if (ret) ++ goto cleanup; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (!ops) { ++ ret = -EOPNOTSUPP; ++ goto cleanup; ++ } ++ ++ ctx.secy = &macsec->secy; ++ ret = macsec_offload(ops->mdo_upd_secy, &ctx); ++ if (ret) ++ goto cleanup; ++ } ++ ++ return 0; ++ ++cleanup: ++ memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); ++ memcpy(&macsec->secy, &secy, sizeof(secy)); ++ ++ return ret; ++} ++ ++static void macsec_del_dev(struct macsec_dev *macsec) ++{ ++ int i; ++ ++ while (macsec->secy.rx_sc) { ++ struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); ++ ++ rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); ++ free_rx_sc(rx_sc); ++ } ++ ++ for (i = 0; i < MACSEC_NUM_AN; i++) { ++ struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); ++ ++ if (sa) { ++ RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); ++ clear_tx_sa(sa); ++ } ++ } ++} ++ ++static void macsec_common_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(netdev_priv(dev), &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ macsec_offload(ops->mdo_del_secy, &ctx); ++ } ++ } ++ ++ unregister_netdevice_queue(dev, head); ++ list_del_rcu(&macsec->secys); ++ macsec_del_dev(macsec); ++ netdev_upper_dev_unlink(real_dev, dev); ++ ++ macsec_generation++; ++} ++ ++static void macsec_dellink(struct net_device *dev, struct list_head *head) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct net_device *real_dev = macsec->real_dev; ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ macsec_common_dellink(dev, head); ++ ++ if (list_empty(&rxd->secys)) { ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ } ++} ++ ++static int register_macsec_dev(struct net_device *real_dev, ++ struct net_device *dev) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); ++ ++ if (!rxd) { ++ int err; ++ ++ rxd = kmalloc(sizeof(*rxd), 
GFP_KERNEL); ++ if (!rxd) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&rxd->secys); ++ ++ err = netdev_rx_handler_register(real_dev, macsec_handle_frame, ++ rxd); ++ if (err < 0) { ++ kfree(rxd); ++ return err; ++ } ++ } ++ ++ list_add_tail_rcu(&macsec->secys, &rxd->secys); ++ return 0; ++} ++ ++static bool sci_exists(struct net_device *dev, sci_t sci) ++{ ++ struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); ++ struct macsec_dev *macsec; ++ ++ list_for_each_entry(macsec, &rxd->secys, secys) { ++ if (macsec->secy.sci == sci) ++ return true; ++ } ++ ++ return false; ++} ++ ++static sci_t dev_to_sci(struct net_device *dev, __be16 port) ++{ ++ return make_sci(dev->dev_addr, port); ++} ++ ++static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ struct macsec_secy *secy = &macsec->secy; ++ ++ macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); ++ if (!macsec->stats) ++ return -ENOMEM; ++ ++ secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); ++ if (!secy->tx_sc.stats) { ++ free_percpu(macsec->stats); ++ return -ENOMEM; ++ } ++ ++ if (sci == MACSEC_UNDEF_SCI) ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ secy->netdev = dev; ++ secy->operational = true; ++ secy->key_len = DEFAULT_SAK_LEN; ++ secy->icv_len = icv_len; ++ secy->validate_frames = MACSEC_VALIDATE_DEFAULT; ++ secy->protect_frames = true; ++ secy->replay_protect = false; ++ secy->xpn = DEFAULT_XPN; ++ ++ secy->sci = sci; ++ secy->tx_sc.active = true; ++ secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; ++ secy->tx_sc.encrypt = DEFAULT_ENCRYPT; ++ secy->tx_sc.send_sci = DEFAULT_SEND_SCI; ++ secy->tx_sc.end_station = false; ++ secy->tx_sc.scb = false; ++ ++ return 0; ++} ++ ++static struct lock_class_key macsec_netdev_addr_lock_key; ++ ++static int macsec_newlink(struct net *net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ struct macsec_dev *macsec = macsec_priv(dev); ++ rx_handler_func_t *rx_handler; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ struct net_device *real_dev; ++ int err, mtu; ++ sci_t sci; ++ ++ if (!tb[IFLA_LINK]) ++ return -EINVAL; ++ real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); ++ if (!real_dev) ++ return -ENODEV; ++ if (real_dev->type != ARPHRD_ETHER) ++ return -EINVAL; ++ ++ dev->priv_flags |= IFF_MACSEC; ++ ++ macsec->real_dev = real_dev; ++ ++ if (data && data[IFLA_MACSEC_OFFLOAD]) ++ macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); ++ else ++ /* MACsec offloading is off by default */ ++ macsec->offload = MACSEC_OFFLOAD_OFF; ++ ++ /* Check if the offloading mode is supported by the underlying layers */ ++ if (macsec->offload != MACSEC_OFFLOAD_OFF && ++ !macsec_check_offload(macsec->offload, macsec)) ++ return -EOPNOTSUPP; ++ ++ /* send_sci must be set to true when transmit sci explicitly is set */ ++ if ((data && data[IFLA_MACSEC_SCI]) && ++ (data && data[IFLA_MACSEC_INC_SCI])) { ++ u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); ++ ++ if (!send_sci) ++ return -EINVAL; ++ } ++ ++ if (data && data[IFLA_MACSEC_ICV_LEN]) ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ mtu = real_dev->mtu - icv_len - macsec_extra_len(true); ++ if (mtu < 0) ++ dev->mtu = 0; ++ else ++ dev->mtu = mtu; ++ ++ rx_handler = rtnl_dereference(real_dev->rx_handler); ++ if (rx_handler && rx_handler != macsec_handle_frame) ++ return -EBUSY; ++ ++ err = register_netdevice(dev); ++ if (err < 0) ++ return err; ++ ++ 
netdev_lockdep_set_classes(dev); ++ lockdep_set_class(&dev->addr_list_lock, ++ &macsec_netdev_addr_lock_key); ++ ++ err = netdev_upper_dev_link(real_dev, dev, extack); ++ if (err < 0) ++ goto unregister; ++ ++ /* need to be already registered so that ->init has run and ++ * the MAC addr is set ++ */ ++ if (data && data[IFLA_MACSEC_SCI]) ++ sci = nla_get_sci(data[IFLA_MACSEC_SCI]); ++ else if (data && data[IFLA_MACSEC_PORT]) ++ sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); ++ else ++ sci = dev_to_sci(dev, MACSEC_PORT_ES); ++ ++ if (rx_handler && sci_exists(real_dev, sci)) { ++ err = -EBUSY; ++ goto unlink; ++ } ++ ++ err = macsec_add_dev(dev, sci, icv_len); ++ if (err) ++ goto unlink; ++ ++ if (data) { ++ err = macsec_changelink_common(dev, data); ++ if (err) ++ goto del_dev; ++ } ++ ++ /* If h/w offloading is available, propagate to the device */ ++ if (macsec_is_offloaded(macsec)) { ++ const struct macsec_ops *ops; ++ struct macsec_context ctx; ++ ++ ops = macsec_get_ops(macsec, &ctx); ++ if (ops) { ++ ctx.secy = &macsec->secy; ++ err = macsec_offload(ops->mdo_add_secy, &ctx); ++ if (err) ++ goto del_dev; ++ } ++ } ++ ++ err = register_macsec_dev(real_dev, dev); ++ if (err < 0) ++ goto del_dev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ linkwatch_fire_event(dev); ++ ++ macsec_generation++; ++ ++ return 0; ++ ++del_dev: ++ macsec_del_dev(macsec); ++unlink: ++ netdev_upper_dev_unlink(real_dev, dev); ++unregister: ++ unregister_netdevice(dev); ++ return err; ++} ++ ++static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], ++ struct netlink_ext_ack *extack) ++{ ++ u64 csid = MACSEC_DEFAULT_CIPHER_ID; ++ u8 icv_len = DEFAULT_ICV_LEN; ++ int flag; ++ bool es, scb, sci; ++ ++ if (!data) ++ return 0; ++ ++ if (data[IFLA_MACSEC_CIPHER_SUITE]) ++ csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); ++ ++ if (data[IFLA_MACSEC_ICV_LEN]) { ++ icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); ++ if (icv_len != DEFAULT_ICV_LEN) { ++ char dummy_key[DEFAULT_SAK_LEN] = { 0 }; ++ struct crypto_aead *dummy_tfm; ++ ++ dummy_tfm = macsec_alloc_tfm(dummy_key, ++ DEFAULT_SAK_LEN, ++ icv_len); ++ if (IS_ERR(dummy_tfm)) ++ return PTR_ERR(dummy_tfm); ++ crypto_free_aead(dummy_tfm); ++ } ++ } ++ ++ switch (csid) { ++ case MACSEC_CIPHER_ID_GCM_AES_128: ++ case MACSEC_CIPHER_ID_GCM_AES_256: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_128: ++ case MACSEC_CIPHER_ID_GCM_AES_XPN_256: ++ case MACSEC_DEFAULT_CIPHER_ID: ++ if (icv_len < MACSEC_MIN_ICV_LEN || ++ icv_len > MACSEC_STD_ICV_LEN) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (data[IFLA_MACSEC_ENCODING_SA]) { ++ if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) ++ return -EINVAL; ++ } ++ ++ for (flag = IFLA_MACSEC_ENCODING_SA + 1; ++ flag < IFLA_MACSEC_VALIDATION; ++ flag++) { ++ if (data[flag]) { ++ if (nla_get_u8(data[flag]) > 1) ++ return -EINVAL; ++ } ++ } ++ ++ es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; ++ sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; ++ scb = data[IFLA_MACSEC_SCB] ? 
nla_get_u8(data[IFLA_MACSEC_SCB]) : false; ++ ++ if ((sci && (scb || es)) || (scb && es)) ++ return -EINVAL; ++ ++ if (data[IFLA_MACSEC_VALIDATION] && ++ nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) ++ return -EINVAL; ++ ++ if ((data[IFLA_MACSEC_REPLAY_PROTECT] && ++ nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && ++ !data[IFLA_MACSEC_WINDOW]) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static struct net *macsec_get_link_net(const struct net_device *dev) ++{ ++ return dev_net(macsec_priv(dev)->real_dev); ++} ++ ++static size_t macsec_get_size(const struct net_device *dev) ++{ ++ return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ ++ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ ++ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ ++ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ ++ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ ++ nla_total_size(1) + /* IFLA_MACSEC_ES */ ++ nla_total_size(1) + /* IFLA_MACSEC_SCB */ ++ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ ++ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ ++ 0; ++} ++ ++static int macsec_fill_info(struct sk_buff *skb, ++ const struct net_device *dev) ++{ ++ struct macsec_secy *secy = &macsec_priv(dev)->secy; ++ struct macsec_tx_sc *tx_sc = &secy->tx_sc; ++ u64 csid; ++ ++ switch (secy->key_len) { ++ case MACSEC_GCM_AES_128_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; ++ break; ++ case MACSEC_GCM_AES_256_SAK_LEN: ++ csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; ++ break; ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, ++ IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || ++ nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, ++ csid, IFLA_MACSEC_PAD) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || ++ nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || ++ nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || ++ nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || ++ nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || ++ nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || ++ nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || ++ nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || ++ 0) ++ goto nla_put_failure; ++ ++ if (secy->replay_protect) { ++ if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) ++ goto nla_put_failure; ++ } ++ ++ return 0; ++ ++nla_put_failure: ++ return -EMSGSIZE; ++} ++ ++static struct rtnl_link_ops macsec_link_ops __read_mostly = { ++ .kind = "macsec", ++ .priv_size = sizeof(struct macsec_dev), ++ .maxtype = IFLA_MACSEC_MAX, ++ .policy = macsec_rtnl_policy, ++ .setup = macsec_setup, ++ .validate = macsec_validate_attr, ++ .newlink = macsec_newlink, ++ .changelink = macsec_changelink, ++ .dellink = macsec_dellink, ++ .get_size = macsec_get_size, ++ .fill_info = macsec_fill_info, ++ .get_link_net = macsec_get_link_net, ++}; ++ ++static bool is_macsec_master(struct net_device *dev) ++{ ++ return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; ++} ++ ++static int macsec_notify(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); ++ LIST_HEAD(head); ++ ++ if (!is_macsec_master(real_dev)) ++ return NOTIFY_DONE; 
++ ++ switch (event) { ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ case NETDEV_CHANGE: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ ++ netif_stacked_transfer_operstate(real_dev, dev); ++ } ++ break; ++ } ++ case NETDEV_UNREGISTER: { ++ struct macsec_dev *m, *n; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry_safe(m, n, &rxd->secys, secys) { ++ macsec_common_dellink(m->secy.netdev, &head); ++ } ++ ++ netdev_rx_handler_unregister(real_dev); ++ kfree(rxd); ++ ++ unregister_netdevice_many(&head); ++ break; ++ } ++ case NETDEV_CHANGEMTU: { ++ struct macsec_dev *m; ++ struct macsec_rxh_data *rxd; ++ ++ rxd = macsec_data_rtnl(real_dev); ++ list_for_each_entry(m, &rxd->secys, secys) { ++ struct net_device *dev = m->secy.netdev; ++ unsigned int mtu = real_dev->mtu - (m->secy.icv_len + ++ macsec_extra_len(true)); ++ ++ if (dev->mtu > mtu) ++ dev_set_mtu(dev, mtu); ++ } ++ } ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block macsec_notifier = { ++ .notifier_call = macsec_notify, ++}; ++ ++static int __init macsec_init(void) ++{ ++ int err; ++ ++ pr_info("MACsec IEEE 802.1AE\n"); ++ err = register_netdevice_notifier(&macsec_notifier); ++ if (err) ++ return err; ++ ++ err = rtnl_link_register(&macsec_link_ops); ++ if (err) ++ goto notifier; ++ ++ err = genl_register_family(&macsec_fam); ++ if (err) ++ goto rtnl; ++ ++ return 0; ++ ++rtnl: ++ rtnl_link_unregister(&macsec_link_ops); ++notifier: ++ unregister_netdevice_notifier(&macsec_notifier); ++ return err; ++} ++ ++static void __exit macsec_exit(void) ++{ ++ genl_unregister_family(&macsec_fam); ++ rtnl_link_unregister(&macsec_link_ops); ++ unregister_netdevice_notifier(&macsec_notifier); ++ rcu_barrier(); ++} ++ ++module_init(macsec_init); ++module_exit(macsec_exit); ++ ++MODULE_ALIAS_RTNL_LINK("macsec"); ++MODULE_ALIAS_GENL_FAMILY("macsec"); ++ ++MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); ++MODULE_LICENSE("GPL v2"); +diff -rupN linux.orig/drivers/net/macvlan.c linux/drivers/net/macvlan.c +--- linux.orig/drivers/net/macvlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/macvlan.c 2022-12-04 10:40:26.696034096 -0500 +@@ -948,13 +948,13 @@ static void macvlan_dev_get_stats64(stru for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { @@ -2829,11 +20477,10 @@ index 1080d6ebff63b..a1c7823f0ba66 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c -index 0b1b6f650104b..ff302144029de 100644 ---- a/drivers/net/mhi_net.c -+++ b/drivers/net/mhi_net.c -@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/mhi_net.c linux/drivers/net/mhi_net.c +--- linux.orig/drivers/net/mhi_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/mhi_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -104,19 +104,19 @@ static void mhi_ndo_get_stats64(struct n unsigned int start; do { @@ -2857,11 +20504,10 @@ index 0b1b6f650104b..ff302144029de 100644 } static const struct net_device_ops mhi_netdev_ops = { -diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c -index 9a1a5b2036240..e470e3398abc2 100644 ---- a/drivers/net/netdevsim/netdev.c -+++ b/drivers/net/netdevsim/netdev.c -@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) 
+diff -rupN linux.orig/drivers/net/netdevsim/netdev.c linux/drivers/net/netdevsim/netdev.c +--- linux.orig/drivers/net/netdevsim/netdev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/netdevsim/netdev.c 2022-12-04 10:40:26.696034096 -0500 +@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, unsigned int start; do { @@ -2874,11 +20520,10 @@ index 9a1a5b2036240..e470e3398abc2 100644 } static int -diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c -index 154a3c0a6dfd8..3de937141c168 100644 ---- a/drivers/net/team/team.c -+++ b/drivers/net/team/team.c -@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +diff -rupN linux.orig/drivers/net/team/team.c linux/drivers/net/team/team.c +--- linux.orig/drivers/net/team/team.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1865,13 +1865,13 @@ team_get_stats64(struct net_device *dev, for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { @@ -2894,11 +20539,10 @@ index 154a3c0a6dfd8..3de937141c168 100644 stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; -diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c -index b095a4b4957bb..18d99fda997cf 100644 ---- a/drivers/net/team/team_mode_loadbalance.c -+++ b/drivers/net/team/team_mode_loadbalance.c -@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats, +diff -rupN linux.orig/drivers/net/team/team_mode_loadbalance.c linux/drivers/net/team/team_mode_loadbalance.c +--- linux.orig/drivers/net/team/team_mode_loadbalance.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/team/team_mode_loadbalance.c 2022-12-04 10:40:26.696034096 -0500 +@@ -466,9 +466,9 @@ static void __lb_one_cpu_stats_add(struc struct lb_stats tmp; do { @@ -2910,11 +20554,10 @@ index b095a4b4957bb..18d99fda997cf 100644 acc_stats->tx_bytes += tmp.tx_bytes; } -diff --git a/drivers/net/veth.c b/drivers/net/veth.c -index 466da01ba2e3e..2da7cfcfe1c31 100644 ---- a/drivers/net/veth.c -+++ b/drivers/net/veth.c -@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/veth.c linux/drivers/net/veth.c +--- linux.orig/drivers/net/veth.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/veth.c 2022-12-04 10:40:26.696034096 -0500 +@@ -182,12 +182,12 @@ static void veth_get_ethtool_stats(struc size_t offset; do { @@ -2929,7 +20572,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 idx += VETH_RQ_STATS_LEN; } -@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struct net_device *dev, +@@ -203,12 +203,12 @@ static void veth_get_ethtool_stats(struc tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { @@ -2944,7 +20587,7 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 } } -@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) +@@ -379,13 +379,13 @@ static void veth_stats_rx(struct veth_st unsigned int start; do { @@ -2960,11 +20603,10 @@ index 466da01ba2e3e..2da7cfcfe1c31 100644 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; -diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c -index 9cce7dec7366d..a94d9d8f67fd0 100644 ---- a/drivers/net/virtio_net.c -+++ b/drivers/net/virtio_net.c -@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_device *dev, +diff -rupN linux.orig/drivers/net/virtio_net.c 
linux/drivers/net/virtio_net.c +--- linux.orig/drivers/net/virtio_net.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/virtio_net.c 2022-12-04 10:40:26.696034096 -0500 +@@ -2066,18 +2066,18 @@ static void virtnet_stats(struct net_dev struct send_queue *sq = &vi->sq[i]; do { @@ -2987,7 +20629,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 tot->rx_packets += rpackets; tot->tx_packets += tpackets; -@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2688,12 +2688,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&rq->stats; do { @@ -3002,7 +20644,7 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_RQ_STATS_LEN; } -@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, +@@ -2702,12 +2702,12 @@ static void virtnet_get_ethtool_stats(st stats_base = (u8 *)&sq->stats; do { @@ -3017,11 +20659,10 @@ index 9cce7dec7366d..a94d9d8f67fd0 100644 idx += VIRTNET_SQ_STATS_LEN; } } -diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c -index 5df7a0abc39d5..191ebc482f0c1 100644 ---- a/drivers/net/vrf.c -+++ b/drivers/net/vrf.c -@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_device *dev, +diff -rupN linux.orig/drivers/net/vrf.c linux/drivers/net/vrf.c +--- linux.orig/drivers/net/vrf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vrf.c 2022-12-04 10:40:26.696034096 -0500 +@@ -159,13 +159,13 @@ static void vrf_get_stats64(struct net_d dstats = per_cpu_ptr(dev->dstats, i); do { @@ -3037,11 +20678,10 @@ index 5df7a0abc39d5..191ebc482f0c1 100644 stats->tx_bytes += tbytes; stats->tx_packets += tpkts; stats->tx_dropped += tdrops; -diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c -index 3e04af4c5daa1..a3de081cda5ee 100644 ---- a/drivers/net/vxlan/vxlan_vnifilter.c -+++ b/drivers/net/vxlan/vxlan_vnifilter.c -@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, +diff -rupN linux.orig/drivers/net/vxlan/vxlan_vnifilter.c linux/drivers/net/vxlan/vxlan_vnifilter.c +--- linux.orig/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/vxlan/vxlan_vnifilter.c 2022-12-04 10:40:26.696034096 -0500 +@@ -129,9 +129,9 @@ static void vxlan_vnifilter_stats_get(co pstats = per_cpu_ptr(vninode->stats, i); do { @@ -3053,11 +20693,10 @@ index 3e04af4c5daa1..a3de081cda5ee 100644 dest->rx_packets += temp.rx_packets; dest->rx_bytes += temp.rx_bytes; -diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c -index 6872782e8dd89..22b5939a42bb3 100644 ---- a/drivers/net/wwan/mhi_wwan_mbim.c -+++ b/drivers/net/wwan/mhi_wwan_mbim.c -@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(struct net_device *ndev, +diff -rupN linux.orig/drivers/net/wwan/mhi_wwan_mbim.c linux/drivers/net/wwan/mhi_wwan_mbim.c +--- linux.orig/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/wwan/mhi_wwan_mbim.c 2022-12-04 10:40:26.696034096 -0500 +@@ -456,19 +456,19 @@ static void mhi_mbim_ndo_get_stats64(str unsigned int start; do { @@ -3081,11 +20720,10 @@ index 6872782e8dd89..22b5939a42bb3 100644 } static void mhi_mbim_ul_callback(struct mhi_device *mhi_dev, -diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c -index 27a11cc08c61e..df4dc02638a00 100644 ---- a/drivers/net/xen-netfront.c -+++ b/drivers/net/xen-netfront.c -@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct net_device *dev, +diff -rupN 
linux.orig/drivers/net/xen-netfront.c linux/drivers/net/xen-netfront.c +--- linux.orig/drivers/net/xen-netfront.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/net/xen-netfront.c 2022-12-04 10:40:26.696034096 -0500 +@@ -1392,16 +1392,16 @@ static void xennet_get_stats64(struct ne unsigned int start; do { @@ -3106,11 +20744,10 @@ index 27a11cc08c61e..df4dc02638a00 100644 tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; -diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c -index 2a4b3efb7e12b..9f6ed09538cd0 100644 ---- a/drivers/pinctrl/pinctrl-amd.c -+++ b/drivers/pinctrl/pinctrl-amd.c -@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/pinctrl/pinctrl-amd.c linux/drivers/pinctrl/pinctrl-amd.c +--- linux.orig/drivers/pinctrl/pinctrl-amd.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/pinctrl/pinctrl-amd.c 2022-12-04 10:40:26.696034096 -0500 +@@ -639,7 +639,7 @@ static bool do_amd_gpio_irq_handler(int if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; @@ -3119,11 +20756,10 @@ index 2a4b3efb7e12b..9f6ed09538cd0 100644 /* Clear interrupt. * We must read the pin register again, in case the -diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c -index 617dbf98980ec..97cfbc520a02c 100644 ---- a/drivers/platform/x86/intel/int0002_vgpio.c -+++ b/drivers/platform/x86/intel/int0002_vgpio.c -@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, void *data) +diff -rupN linux.orig/drivers/platform/x86/intel/int0002_vgpio.c linux/drivers/platform/x86/intel/int0002_vgpio.c +--- linux.orig/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/platform/x86/intel/int0002_vgpio.c 2022-12-04 10:40:26.696034096 -0500 +@@ -125,8 +125,7 @@ static irqreturn_t int0002_irq(int irq, if (!(gpe_sts_reg & GPE0A_PME_B0_STS_BIT)) return IRQ_NONE; @@ -3133,10 +20769,9 @@ index 617dbf98980ec..97cfbc520a02c 100644 pm_wakeup_hard_event(chip->parent); -diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c -index 4b42f2302a8a8..d4f77f6688cf7 100644 ---- a/drivers/spi/spi.c -+++ b/drivers/spi/spi.c +diff -rupN linux.orig/drivers/spi/spi.c linux/drivers/spi/spi.c +--- linux.orig/drivers/spi/spi.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/spi/spi.c 2022-12-04 10:40:26.700034085 -0500 @@ -127,10 +127,10 @@ do { \ unsigned int start; \ pcpu_stats = per_cpu_ptr(in, i); \ @@ -3150,11 +20785,10 @@ index 4b42f2302a8a8..d4f77f6688cf7 100644 &pcpu_stats->syncp, start)); \ ret += inc; \ } \ -diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c -index 2de3896489c84..897cb8db5084f 100644 ---- a/drivers/ssb/driver_gpio.c -+++ b/drivers/ssb/driver_gpio.c -@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_handler(int irq, void *dev_id) +diff -rupN linux.orig/drivers/ssb/driver_gpio.c linux/drivers/ssb/driver_gpio.c +--- linux.orig/drivers/ssb/driver_gpio.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/ssb/driver_gpio.c 2022-12-04 10:40:26.700034085 -0500 +@@ -132,7 +132,8 @@ static irqreturn_t ssb_gpio_irq_chipco_h return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3164,7 +20798,7 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_chipco_gpio_polarity(chipco, irqs, val & irqs); return IRQ_HANDLED; -@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_handler(int irq, void *dev_id) +@@ -330,7 +331,8 @@ static irqreturn_t ssb_gpio_irq_extif_ha 
return IRQ_NONE; for_each_set_bit(gpio, &irqs, bus->gpio.ngpio) @@ -3174,11 +20808,207 @@ index 2de3896489c84..897cb8db5084f 100644 ssb_extif_gpio_polarity(extif, irqs, val & irqs); return IRQ_HANDLED; -diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h -index 287153d325365..81f5fce6e895f 100644 ---- a/drivers/tty/serial/8250/8250.h -+++ b/drivers/tty/serial/8250/8250.h -@@ -177,12 +177,74 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c linux/drivers/tty/serial/8250/8250_aspeed_vuart.c +--- linux.orig/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_aspeed_vuart.c 2022-12-04 10:40:26.700034085 -0500 +@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle( + up->ier &= ~irqs; + if (!throttle) + up->ier |= irqs; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) + { +diff -rupN linux.orig/drivers/tty/serial/8250/8250_bcm7271.c linux/drivers/tty/serial/8250/8250_bcm7271.c +--- linux.orig/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_bcm7271.c 2022-12-04 10:40:26.700034085 -0500 +@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_ + * will handle this. + */ + up->ier &= ~UART_IER_RDI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + priv->tx_running = false; + priv->dma.rx_dma = NULL; +@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct ua + unsigned int iir = serial_port_in(p, UART_IIR); + struct brcmuart_priv *priv = p->private_data; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; + unsigned int ier; + unsigned int mcr; ++ bool is_console; + int handled = 0; + + /* +@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct ua + spin_lock_irqsave(&p->lock, flags); + status = serial_port_in(p, UART_LSR); + if ((status & UART_LSR_DR) == 0) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); + + ier = serial_port_in(p, UART_IER); + /* +@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct ua + serial_port_in(p, UART_RX); + } + ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); ++ + handled = 1; + } + spin_unlock_irqrestore(&p->lock, flags); +@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrt + struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); + struct uart_port *p = priv->up; + struct uart_8250_port *up = up_to_u8250p(p); ++ unsigned long cs_flags; + unsigned int status; + unsigned long flags; ++ bool is_console; + + if (priv->shutdown) + return HRTIMER_NORESTART; +@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrt + /* re-enable receive unless upper layer has disabled it */ + if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == + (UART_IER_RLSI | UART_IER_RDI)) { ++ is_console = uart_console(p); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + status = serial_port_in(p, UART_IER); + status |= (UART_IER_RLSI | UART_IER_RDI); + serial_port_out(p, UART_IER, status); + status = serial_port_in(p, UART_MCR); + status |= UART_MCR_RTS; + serial_port_out(p, UART_MCR, status); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + spin_unlock_irqrestore(&p->lock, flags); + 
return HRTIMER_NORESTART; +diff -rupN linux.orig/drivers/tty/serial/8250/8250_core.c linux/drivers/tty/serial/8250/8250_core.c +--- linux.orig/drivers/tty/serial/8250/8250_core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_core.c 2022-12-04 10:40:26.700034085 -0500 +@@ -255,8 +255,11 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = from_timer(up, t, timer); ++ struct uart_port *port = &up->port; + unsigned int iir, ier = 0, lsr; ++ unsigned long cs_flags; + unsigned long flags; ++ bool is_console; + + spin_lock_irqsave(&up->port.lock, flags); + +@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(st + * based handler. + */ + if (up->port.irq) { ++ is_console = uart_console(port); ++ ++ if (is_console) ++ printk_cpu_sync_get_irqsave(cs_flags); ++ + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, 0); ++ ++ if (is_console) ++ printk_cpu_sync_put_irqrestore(cs_flags); + } + + iir = serial_in(up, UART_IIR); +@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_dr + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -668,6 +687,7 @@ static int univ8250_console_match(struct + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_ + spin_lock_irqsave(&port->lock, flags); + up->ier |= UART_IER_RLSI | UART_IER_RDI; + up->port.read_status_mask |= UART_LSR_DR; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + spin_unlock_irqrestore(&port->lock, flags); + } + +diff -rupN linux.orig/drivers/tty/serial/8250/8250_exar.c linux/drivers/tty/serial/8250/8250_exar.c +--- linux.orig/drivers/tty/serial/8250/8250_exar.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_exar.c 2022-12-04 10:40:26.700034085 -0500 +@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct + + static int xr17v35x_startup(struct uart_port *port) + { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ + /* + * First enable access to IER [7:5], ISR [5:4], FCR [5:4], + * MCR [7:5] and MSR [7:0] +@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_ + * Make sure all interrups are masked until initialization is + * complete and the FIFOs are cleared + */ +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + + return serial8250_do_startup(port); + } +diff -rupN linux.orig/drivers/tty/serial/8250/8250_fsl.c linux/drivers/tty/serial/8250/8250_fsl.c +--- linux.orig/drivers/tty/serial/8250/8250_fsl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_fsl.c 2022-12-04 10:40:26.700034085 -0500 +@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned long delay; + 
+- up->ier = port->serial_in(port, UART_IER); ++ up->ier = serial8250_in_IER(up); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff -rupN linux.orig/drivers/tty/serial/8250/8250.h linux/drivers/tty/serial/8250/8250.h +--- linux.orig/drivers/tty/serial/8250/8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250.h 2022-12-04 10:40:26.700034085 -0500 +@@ -177,12 +177,74 @@ static inline void serial_dl_write(struc up->dl_write(up, value); } @@ -3254,7 +21084,7 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) +@@ -191,7 +253,7 @@ static inline bool serial8250_clear_THRI if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; @@ -3263,213 +21093,10 @@ index 287153d325365..81f5fce6e895f 100644 return true; } -diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c -index 9d2a7856784f7..7cc6b527c088b 100644 ---- a/drivers/tty/serial/8250/8250_aspeed_vuart.c -+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c -@@ -278,7 +278,7 @@ static void __aspeed_vuart_set_throttle(struct uart_8250_port *up, - up->ier &= ~irqs; - if (!throttle) - up->ier |= irqs; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - } - static void aspeed_vuart_set_throttle(struct uart_port *port, bool throttle) - { -diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c -index 8efdc271eb75f..d30c74618411f 100644 ---- a/drivers/tty/serial/8250/8250_bcm7271.c -+++ b/drivers/tty/serial/8250/8250_bcm7271.c -@@ -609,7 +609,7 @@ static int brcmuart_startup(struct uart_port *port) - * will handle this. 
- */ - up->ier &= ~UART_IER_RDI; -- serial_port_out(port, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - - priv->tx_running = false; - priv->dma.rx_dma = NULL; -@@ -775,10 +775,12 @@ static int brcmuart_handle_irq(struct uart_port *p) - unsigned int iir = serial_port_in(p, UART_IIR); - struct brcmuart_priv *priv = p->private_data; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; - unsigned int ier; - unsigned int mcr; -+ bool is_console; - int handled = 0; - - /* -@@ -789,6 +791,10 @@ static int brcmuart_handle_irq(struct uart_port *p) - spin_lock_irqsave(&p->lock, flags); - status = serial_port_in(p, UART_LSR); - if ((status & UART_LSR_DR) == 0) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); - - ier = serial_port_in(p, UART_IER); - /* -@@ -809,6 +815,9 @@ static int brcmuart_handle_irq(struct uart_port *p) - serial_port_in(p, UART_RX); - } - -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); -+ - handled = 1; - } - spin_unlock_irqrestore(&p->lock, flags); -@@ -823,8 +832,10 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - struct brcmuart_priv *priv = container_of(t, struct brcmuart_priv, hrt); - struct uart_port *p = priv->up; - struct uart_8250_port *up = up_to_u8250p(p); -+ unsigned long cs_flags; - unsigned int status; - unsigned long flags; -+ bool is_console; - - if (priv->shutdown) - return HRTIMER_NORESTART; -@@ -846,12 +857,20 @@ static enum hrtimer_restart brcmuart_hrtimer_func(struct hrtimer *t) - /* re-enable receive unless upper layer has disabled it */ - if ((up->ier & (UART_IER_RLSI | UART_IER_RDI)) == - (UART_IER_RLSI | UART_IER_RDI)) { -+ is_console = uart_console(p); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - status = serial_port_in(p, UART_IER); - status |= (UART_IER_RLSI | UART_IER_RDI); - serial_port_out(p, UART_IER, status); - status = serial_port_in(p, UART_MCR); - status |= UART_MCR_RTS; - serial_port_out(p, UART_MCR, status); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - spin_unlock_irqrestore(&p->lock, flags); - return HRTIMER_NORESTART; -diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c -index 94fbf0add2ce2..196d0c55dfe99 100644 ---- a/drivers/tty/serial/8250/8250_core.c -+++ b/drivers/tty/serial/8250/8250_core.c -@@ -255,8 +255,11 @@ static void serial8250_timeout(struct timer_list *t) - static void serial8250_backup_timeout(struct timer_list *t) - { - struct uart_8250_port *up = from_timer(up, t, timer); -+ struct uart_port *port = &up->port; - unsigned int iir, ier = 0, lsr; -+ unsigned long cs_flags; - unsigned long flags; -+ bool is_console; - - spin_lock_irqsave(&up->port.lock, flags); - -@@ -265,8 +268,16 @@ static void serial8250_backup_timeout(struct timer_list *t) - * based handler. 
- */ - if (up->port.irq) { -+ is_console = uart_console(port); -+ -+ if (is_console) -+ printk_cpu_sync_get_irqsave(cs_flags); -+ - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); -+ -+ if (is_console) -+ printk_cpu_sync_put_irqrestore(cs_flags); - } - - iir = serial_in(up, UART_IIR); -@@ -289,7 +300,7 @@ static void serial8250_backup_timeout(struct timer_list *t) - serial8250_tx_chars(up); - - if (up->port.irq) -- serial_out(up, UART_IER, ier); -+ serial8250_set_IER(up, ier); - - spin_unlock_irqrestore(&up->port.lock, flags); - -@@ -575,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) - - #ifdef CONFIG_SERIAL_8250_CONSOLE - -+static void univ8250_console_write_atomic(struct console *co, const char *s, -+ unsigned int count) -+{ -+ struct uart_8250_port *up = &serial8250_ports[co->index]; -+ -+ serial8250_console_write_atomic(up, s, count); -+} -+ - static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) - { -@@ -668,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, - - static struct console univ8250_console = { - .name = "ttyS", -+ .write_atomic = univ8250_console_write_atomic, - .write = univ8250_console_write, - .device = uart_console_device, - .setup = univ8250_console_setup, -@@ -961,7 +981,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work) - spin_lock_irqsave(&port->lock, flags); - up->ier |= UART_IER_RLSI | UART_IER_RDI; - up->port.read_status_mask |= UART_LSR_DR; -- serial_out(up, UART_IER, up->ier); -+ serial8250_set_IER(up, up->ier); - spin_unlock_irqrestore(&port->lock, flags); - } - -diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c -index 314a05e009df9..9809517de8270 100644 ---- a/drivers/tty/serial/8250/8250_exar.c -+++ b/drivers/tty/serial/8250/8250_exar.c -@@ -179,6 +179,8 @@ static void xr17v35x_set_divisor(struct uart_port *p, unsigned int baud, - - static int xr17v35x_startup(struct uart_port *port) - { -+ struct uart_8250_port *up = up_to_u8250p(port); -+ - /* - * First enable access to IER [7:5], ISR [5:4], FCR [5:4], - * MCR [7:5] and MSR [7:0] -@@ -189,7 +191,7 @@ static int xr17v35x_startup(struct uart_port *port) - * Make sure all interrups are masked until initialization is - * complete and the FIFOs are cleared - */ -- serial_port_out(port, UART_IER, 0); -+ serial8250_set_IER(up, 0); - - return serial8250_do_startup(port); - } -diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c -index 8aad15622a2e5..74bb85b705e7f 100644 ---- a/drivers/tty/serial/8250/8250_fsl.c -+++ b/drivers/tty/serial/8250/8250_fsl.c -@@ -58,7 +58,8 @@ int fsl8250_handle_irq(struct uart_port *port) - if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { - unsigned long delay; - -- up->ier = port->serial_in(port, UART_IER); -+ up->ier = serial8250_in_IER(up); -+ - if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { - port->ops->stop_rx(port); - } else { -diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c -index 2b2f5d8d24b91..2b78e6c394fb9 100644 ---- a/drivers/tty/serial/8250/8250_ingenic.c -+++ b/drivers/tty/serial/8250/8250_ingenic.c -@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", +diff -rupN linux.orig/drivers/tty/serial/8250/8250_ingenic.c linux/drivers/tty/serial/8250/8250_ingenic.c +--- linux.orig/drivers/tty/serial/8250/8250_ingenic.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/drivers/tty/serial/8250/8250_ingenic.c 2022-12-04 10:40:26.700034085 -0500 +@@ -146,6 +146,7 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { @@ -3477,7 +21104,7 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 int ier; switch (offset) { -@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) +@@ -167,7 +168,7 @@ static void ingenic_uart_serial_out(stru * If we have enabled modem status IRQs we should enable * modem mode. */ @@ -3486,11 +21113,10 @@ index 2b2f5d8d24b91..2b78e6c394fb9 100644 if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; -diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c -index 54051ec7b4992..6092c75808fb9 100644 ---- a/drivers/tty/serial/8250/8250_mtk.c -+++ b/drivers/tty/serial/8250/8250_mtk.c -@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart_port *port) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_mtk.c linux/drivers/tty/serial/8250/8250_mtk.c +--- linux.orig/drivers/tty/serial/8250/8250_mtk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_mtk.c 2022-12-04 10:40:26.700034085 -0500 +@@ -222,12 +222,40 @@ static void mtk8250_shutdown(struct uart static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { @@ -3533,20 +21159,19 @@ index 54051ec7b4992..6092c75808fb9 100644 } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) -diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c -index 38ee3e42251af..8dc983a8cad15 100644 ---- a/drivers/tty/serial/8250/8250_omap.c -+++ b/drivers/tty/serial/8250/8250_omap.c -@@ -325,7 +325,7 @@ static void omap8250_restore_regs(struct uart_8250_port *up) - +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c linux/drivers/tty/serial/8250/8250_omap.c +--- linux.orig/drivers/tty/serial/8250/8250_omap.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c 2022-12-04 10:41:15.271907054 -0500 +@@ -328,7 +328,7 @@ static void omap8250_restore_regs(struct /* drop TCR + TLR access, we setup XON/XOFF later */ - serial8250_out_MCR(up, up->mcr); + serial8250_out_MCR(up, mcr); + - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_dl_write(up, priv->quot); -@@ -515,7 +515,7 @@ static void omap_8250_pm(struct uart_port *port, unsigned int state, +@@ -518,7 +518,7 @@ static void omap_8250_pm(struct uart_por serial_out(up, UART_EFR, efr | UART_EFR_ECB); serial_out(up, UART_LCR, 0); @@ -3555,7 +21180,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, efr); serial_out(up, UART_LCR, 0); -@@ -636,7 +636,7 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) +@@ -639,7 +639,7 @@ static irqreturn_t omap8250_irq(int irq, if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { unsigned long delay; @@ -3564,7 +21189,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { -@@ -696,7 +696,7 @@ static int omap_8250_startup(struct uart_port *port) +@@ -698,7 +698,7 @@ static int omap_8250_startup(struct uart goto err; up->ier = UART_IER_RLSI | UART_IER_RDI; @@ -3573,7 +21198,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 #ifdef CONFIG_PM up->capabilities |= UART_CAP_RPM; -@@ -737,7 +737,7 @@ static void 
omap_8250_shutdown(struct uart_port *port) +@@ -739,7 +739,7 @@ static void omap_8250_shutdown(struct ua serial_out(up, UART_OMAP_EFR2, 0x0); up->ier = 0; @@ -3582,7 +21207,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (up->dma) serial8250_release_dma(up); -@@ -785,7 +785,7 @@ static void omap_8250_unthrottle(struct uart_port *port) +@@ -787,7 +787,7 @@ static void omap_8250_unthrottle(struct up->dma->rx_dma(up); up->ier |= UART_IER_RLSI | UART_IER_RDI; port->read_status_mask |= UART_LSR_DR; @@ -3591,7 +21216,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 spin_unlock_irqrestore(&port->lock, flags); pm_runtime_mark_last_busy(port->dev); -@@ -876,7 +876,7 @@ static void __dma_rx_complete(void *param) +@@ -878,7 +878,7 @@ static void __dma_rx_complete(void *para __dma_rx_do_complete(p); if (!priv->throttled) { p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3600,7 +21225,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 if (!(priv->habit & UART_HAS_EFR2)) omap_8250_rx_dma(p); } -@@ -933,7 +933,7 @@ static int omap_8250_rx_dma(struct uart_8250_port *p) +@@ -935,7 +935,7 @@ static int omap_8250_rx_dma(struct uart_ * callback to run. */ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3609,7 +21234,7 @@ index 38ee3e42251af..8dc983a8cad15 100644 } goto out; } -@@ -1148,12 +1148,12 @@ static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, +@@ -1150,12 +1150,12 @@ static void am654_8250_handle_rx_dma(str * periodic timeouts, re-enable interrupts. */ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); @@ -3624,11 +21249,1731 @@ index 38ee3e42251af..8dc983a8cad15 100644 } } -diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c -index 2030a92ac66e7..326549603740d 100644 ---- a/drivers/tty/serial/8250/8250_port.c -+++ b/drivers/tty/serial/8250/8250_port.c -@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) +diff -rupN linux.orig/drivers/tty/serial/8250/8250_omap.c.orig linux/drivers/tty/serial/8250/8250_omap.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_omap.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_omap.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,1716 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * 8250-core based driver for the OMAP internal UART ++ * ++ * based on omap-serial.c, Copyright (C) 2010 Texas Instruments. ++ * ++ * Copyright (C) 2014 Sebastian Andrzej Siewior ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "8250.h" ++ ++#define DEFAULT_CLK_SPEED 48000000 ++ ++#define UART_ERRATA_i202_MDR1_ACCESS (1 << 0) ++#define OMAP_UART_WER_HAS_TX_WAKEUP (1 << 1) ++#define OMAP_DMA_TX_KICK (1 << 2) ++/* ++ * See Advisory 21 in AM437x errata SPRZ408B, updated April 2015. ++ * The same errata is applicable to AM335x and DRA7x processors too. 
++ */ ++#define UART_ERRATA_CLOCK_DISABLE (1 << 3) ++#define UART_HAS_EFR2 BIT(4) ++#define UART_HAS_RHR_IT_DIS BIT(5) ++#define UART_RX_TIMEOUT_QUIRK BIT(6) ++ ++#define OMAP_UART_FCR_RX_TRIG 6 ++#define OMAP_UART_FCR_TX_TRIG 4 ++ ++/* SCR register bitmasks */ ++#define OMAP_UART_SCR_RX_TRIG_GRANU1_MASK (1 << 7) ++#define OMAP_UART_SCR_TX_TRIG_GRANU1_MASK (1 << 6) ++#define OMAP_UART_SCR_TX_EMPTY (1 << 3) ++#define OMAP_UART_SCR_DMAMODE_MASK (3 << 1) ++#define OMAP_UART_SCR_DMAMODE_1 (1 << 1) ++#define OMAP_UART_SCR_DMAMODE_CTL (1 << 0) ++ ++/* MVR register bitmasks */ ++#define OMAP_UART_MVR_SCHEME_SHIFT 30 ++#define OMAP_UART_LEGACY_MVR_MAJ_MASK 0xf0 ++#define OMAP_UART_LEGACY_MVR_MAJ_SHIFT 4 ++#define OMAP_UART_LEGACY_MVR_MIN_MASK 0x0f ++#define OMAP_UART_MVR_MAJ_MASK 0x700 ++#define OMAP_UART_MVR_MAJ_SHIFT 8 ++#define OMAP_UART_MVR_MIN_MASK 0x3f ++ ++/* SYSC register bitmasks */ ++#define OMAP_UART_SYSC_SOFTRESET (1 << 1) ++ ++/* SYSS register bitmasks */ ++#define OMAP_UART_SYSS_RESETDONE (1 << 0) ++ ++#define UART_TI752_TLR_TX 0 ++#define UART_TI752_TLR_RX 4 ++ ++#define TRIGGER_TLR_MASK(x) ((x & 0x3c) >> 2) ++#define TRIGGER_FCR_MASK(x) (x & 3) ++ ++/* Enable XON/XOFF flow control on output */ ++#define OMAP_UART_SW_TX 0x08 ++/* Enable XON/XOFF flow control on input */ ++#define OMAP_UART_SW_RX 0x02 ++ ++#define OMAP_UART_WER_MOD_WKUP 0x7f ++#define OMAP_UART_TX_WAKEUP_EN (1 << 7) ++ ++#define TX_TRIGGER 1 ++#define RX_TRIGGER 48 ++ ++#define OMAP_UART_TCR_RESTORE(x) ((x / 4) << 4) ++#define OMAP_UART_TCR_HALT(x) ((x / 4) << 0) ++ ++#define UART_BUILD_REVISION(x, y) (((x) << 8) | (y)) ++ ++#define OMAP_UART_REV_46 0x0406 ++#define OMAP_UART_REV_52 0x0502 ++#define OMAP_UART_REV_63 0x0603 ++ ++/* Interrupt Enable Register 2 */ ++#define UART_OMAP_IER2 0x1B ++#define UART_OMAP_IER2_RHR_IT_DIS BIT(2) ++ ++/* Enhanced features register 2 */ ++#define UART_OMAP_EFR2 0x23 ++#define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6) ++ ++/* RX FIFO occupancy indicator */ ++#define UART_OMAP_RX_LVL 0x19 ++ ++struct omap8250_priv { ++ int line; ++ u8 habit; ++ u8 mdr1; ++ u8 efr; ++ u8 scr; ++ u8 wer; ++ u8 xon; ++ u8 xoff; ++ u8 delayed_restore; ++ u16 quot; ++ ++ u8 tx_trigger; ++ u8 rx_trigger; ++ bool is_suspending; ++ int wakeirq; ++ int wakeups_enabled; ++ u32 latency; ++ u32 calc_latency; ++ struct pm_qos_request pm_qos_request; ++ struct work_struct qos_work; ++ struct uart_8250_dma omap8250_dma; ++ spinlock_t rx_dma_lock; ++ bool rx_dma_broken; ++ bool throttled; ++}; ++ ++struct omap8250_dma_params { ++ u32 rx_size; ++ u8 rx_trigger; ++ u8 tx_trigger; ++}; ++ ++struct omap8250_platdata { ++ struct omap8250_dma_params *dma_params; ++ u8 habit; ++}; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p); ++#else ++static inline void omap_8250_rx_dma_flush(struct uart_8250_port *p) { } ++#endif ++ ++static u32 uart_read(struct uart_8250_port *up, u32 reg) ++{ ++ return readl(up->port.membase + (reg << up->port.regshift)); ++} ++ ++/* ++ * Called on runtime PM resume path from omap8250_restore_regs(), and ++ * omap8250_set_mctrl(). 
++ */ ++static void __omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u8 lcr; ++ ++ serial8250_do_set_mctrl(port, mctrl); ++ ++ if (!mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS)) { ++ /* ++ * Turn off autoRTS if RTS is lowered and restore autoRTS ++ * setting if RTS is raised ++ */ ++ lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if ((mctrl & TIOCM_RTS) && (port->status & UPSTAT_AUTORTS)) ++ priv->efr |= UART_EFR_RTS; ++ else ++ priv->efr &= ~UART_EFR_RTS; ++ serial_out(up, UART_EFR, priv->efr); ++ serial_out(up, UART_LCR, lcr); ++ } ++} ++ ++static void omap8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ int err; ++ ++ err = pm_runtime_resume_and_get(port->dev); ++ if (err) ++ return; ++ ++ __omap8250_set_mctrl(port, mctrl); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++/* ++ * Work Around for Errata i202 (2430, 3430, 3630, 4430 and 4460) ++ * The access to uart register after MDR1 Access ++ * causes UART to corrupt data. ++ * ++ * Need a delay = ++ * 5 L4 clock cycles + 5 UART functional clock cycle (@48MHz = ~0.2uS) ++ * give 10 times as much ++ */ ++static void omap_8250_mdr1_errataset(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++ udelay(2); ++ serial_out(up, UART_FCR, up->fcr | UART_FCR_CLEAR_XMIT | ++ UART_FCR_CLEAR_RCVR); ++} ++ ++static void omap_8250_get_divisor(struct uart_port *port, unsigned int baud, ++ struct omap8250_priv *priv) ++{ ++ unsigned int uartclk = port->uartclk; ++ unsigned int div_13, div_16; ++ unsigned int abs_d13, abs_d16; ++ ++ /* ++ * Old custom speed handling. ++ */ ++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) { ++ priv->quot = port->custom_divisor & UART_DIV_MAX; ++ /* ++ * I assume that nobody is using this. But hey, if somebody ++ * would like to specify the divisor _and_ the mode then the ++ * driver is ready and waiting for it. ++ */ ++ if (port->custom_divisor & (1 << 16)) ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ else ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ return; ++ } ++ div_13 = DIV_ROUND_CLOSEST(uartclk, 13 * baud); ++ div_16 = DIV_ROUND_CLOSEST(uartclk, 16 * baud); ++ ++ if (!div_13) ++ div_13 = 1; ++ if (!div_16) ++ div_16 = 1; ++ ++ abs_d13 = abs(baud - uartclk / 13 / div_13); ++ abs_d16 = abs(baud - uartclk / 16 / div_16); ++ ++ if (abs_d13 >= abs_d16) { ++ priv->mdr1 = UART_OMAP_MDR1_16X_MODE; ++ priv->quot = div_16; ++ } else { ++ priv->mdr1 = UART_OMAP_MDR1_13X_MODE; ++ priv->quot = div_13; ++ } ++} ++ ++static void omap8250_update_scr(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ u8 old_scr; ++ ++ old_scr = serial_in(up, UART_OMAP_SCR); ++ if (old_scr == priv->scr) ++ return; ++ ++ /* ++ * The manual recommends not to enable the DMA mode selector in the SCR ++ * (instead of the FCR) register _and_ selecting the DMA mode as one ++ * register write because this may lead to malfunction. 
++ */ ++ if (priv->scr & OMAP_UART_SCR_DMAMODE_MASK) ++ serial_out(up, UART_OMAP_SCR, ++ priv->scr & ~OMAP_UART_SCR_DMAMODE_MASK); ++ serial_out(up, UART_OMAP_SCR, priv->scr); ++} ++ ++static void omap8250_update_mdr1(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ if (priv->habit & UART_ERRATA_i202_MDR1_ACCESS) ++ omap_8250_mdr1_errataset(up, priv); ++ else ++ serial_out(up, UART_OMAP_MDR1, priv->mdr1); ++} ++ ++static void omap8250_restore_regs(struct uart_8250_port *up) ++{ ++ struct omap8250_priv *priv = up->port.private_data; ++ struct uart_8250_dma *dma = up->dma; ++ u8 mcr = serial8250_in_MCR(up); ++ ++ if (dma && dma->tx_running) { ++ /* ++ * TCSANOW requests the change to occur immediately however if ++ * we have a TX-DMA operation in progress then it has been ++ * observed that it might stall and never complete. Therefore we ++ * delay DMA completes to prevent this hang from happen. ++ */ ++ priv->delayed_restore = 1; ++ return; ++ } ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial8250_out_MCR(up, mcr | UART_MCR_TCRTLR); ++ serial_out(up, UART_FCR, up->fcr); ++ ++ omap8250_update_scr(up, priv); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ serial_out(up, UART_TI752_TCR, OMAP_UART_TCR_RESTORE(16) | ++ OMAP_UART_TCR_HALT(52)); ++ serial_out(up, UART_TI752_TLR, ++ TRIGGER_TLR_MASK(priv->tx_trigger) << UART_TI752_TLR_TX | ++ TRIGGER_TLR_MASK(priv->rx_trigger) << UART_TI752_TLR_RX); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ /* drop TCR + TLR access, we setup XON/XOFF later */ ++ serial8250_out_MCR(up, mcr); ++ ++ serial_out(up, UART_IER, up->ier); ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_dl_write(up, priv->quot); ++ ++ serial_out(up, UART_EFR, priv->efr); ++ ++ /* Configure flow control */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_XON1, priv->xon); ++ serial_out(up, UART_XOFF1, priv->xoff); ++ ++ serial_out(up, UART_LCR, up->lcr); ++ ++ omap8250_update_mdr1(up, priv); ++ ++ __omap8250_set_mctrl(&up->port, up->port.mctrl); ++ ++ if (up->port.rs485.flags & SER_RS485_ENABLED) ++ serial8250_em485_stop_tx(up); ++} ++ ++/* ++ * OMAP can use "CLK / (16 or 13) / div" for baud rate. And then we have have ++ * some differences in how we want to handle flow control. ++ */ ++static void omap_8250_set_termios(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ unsigned char cval = 0; ++ unsigned int baud; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(termios->c_cflag)); ++ ++ if (termios->c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (termios->c_cflag & PARENB) ++ cval |= UART_LCR_PARITY; ++ if (!(termios->c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (termios->c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ */ ++ baud = uart_get_baud_rate(port, termios, old, ++ port->uartclk / 16 / UART_DIV_MAX, ++ port->uartclk / 13); ++ omap_8250_get_divisor(port, baud, priv); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ pm_runtime_get_sync(port->dev); ++ spin_lock_irq(&port->lock); ++ ++ /* ++ * Update the per-port timeout. 
++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | PARMRK)) ++ up->port.read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ up->port.ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ up->port.ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ up->port.ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ up->port.ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * Modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ ++ up->lcr = cval; ++ /* Up to here it was mostly serial8250_do_set_termios() */ ++ ++ /* ++ * We enable TRIG_GRANU for RX and TX and additionally we set ++ * SCR_TX_EMPTY bit. The result is the following: ++ * - RX_TRIGGER amount of bytes in the FIFO will cause an interrupt. ++ * - less than RX_TRIGGER number of bytes will also cause an interrupt ++ * once the UART decides that there no new bytes arriving. ++ * - Once THRE is enabled, the interrupt will be fired once the FIFO is ++ * empty - the trigger level is ignored here. ++ * ++ * Once DMA is enabled: ++ * - UART will assert the TX DMA line once there is room for TX_TRIGGER ++ * bytes in the TX FIFO. On each assert the DMA engine will move ++ * TX_TRIGGER bytes into the FIFO. ++ * - UART will assert the RX DMA line once there are RX_TRIGGER bytes in ++ * the FIFO and move RX_TRIGGER bytes. ++ * This is because threshold and trigger values are the same. ++ */ ++ up->fcr = UART_FCR_ENABLE_FIFO; ++ up->fcr |= TRIGGER_FCR_MASK(priv->tx_trigger) << OMAP_UART_FCR_TX_TRIG; ++ up->fcr |= TRIGGER_FCR_MASK(priv->rx_trigger) << OMAP_UART_FCR_RX_TRIG; ++ ++ priv->scr = OMAP_UART_SCR_RX_TRIG_GRANU1_MASK | OMAP_UART_SCR_TX_EMPTY | ++ OMAP_UART_SCR_TX_TRIG_GRANU1_MASK; ++ ++ if (up->dma) ++ priv->scr |= OMAP_UART_SCR_DMAMODE_1 | ++ OMAP_UART_SCR_DMAMODE_CTL; ++ ++ priv->xon = termios->c_cc[VSTART]; ++ priv->xoff = termios->c_cc[VSTOP]; ++ ++ priv->efr = 0; ++ up->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS | UPSTAT_AUTOXOFF); ++ ++ if (termios->c_cflag & CRTSCTS && up->port.flags & UPF_HARD_FLOW && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_RTS) && ++ !mctrl_gpio_to_gpiod(up->gpios, UART_GPIO_CTS)) { ++ /* Enable AUTOCTS (autoRTS is enabled when RTS is raised) */ ++ up->port.status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; ++ priv->efr |= UART_EFR_CTS; ++ } else if (up->port.flags & UPF_SOFT_FLOW) { ++ /* ++ * OMAP rx s/w flow control is borked; the transmitter remains ++ * stuck off even if rx flow control is subsequently disabled ++ */ ++ ++ /* ++ * IXOFF Flag: ++ * Enable XON/XOFF flow control on output. 
++ * Transmit XON1, XOFF1 ++ */ ++ if (termios->c_iflag & IXOFF) { ++ up->port.status |= UPSTAT_AUTOXOFF; ++ priv->efr |= OMAP_UART_SW_TX; ++ } ++ } ++ omap8250_restore_regs(up); ++ ++ spin_unlock_irq(&up->port.lock); ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ ++ /* calculate wakeup latency constraint */ ++ priv->calc_latency = USEC_PER_SEC * 64 * 8 / baud; ++ priv->latency = priv->calc_latency; ++ ++ schedule_work(&priv->qos_work); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++ ++/* same as 8250 except that we may have extra flow bits set in EFR */ ++static void omap_8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ u8 efr; ++ ++ pm_runtime_get_sync(port->dev); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ efr = serial_in(up, UART_EFR); ++ serial_out(up, UART_EFR, efr | UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_IER, (state != 0) ? UART_IERX_SLEEP : 0); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, efr); ++ serial_out(up, UART_LCR, 0); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_serial_fill_features_erratas(struct uart_8250_port *up, ++ struct omap8250_priv *priv) ++{ ++ static const struct soc_device_attribute k3_soc_devices[] = { ++ { .family = "AM65X", }, ++ { .family = "J721E", .revision = "SR1.0" }, ++ { /* sentinel */ } ++ }; ++ u32 mvr, scheme; ++ u16 revision, major, minor; ++ ++ mvr = uart_read(up, UART_OMAP_MVER); ++ ++ /* Check revision register scheme */ ++ scheme = mvr >> OMAP_UART_MVR_SCHEME_SHIFT; ++ ++ switch (scheme) { ++ case 0: /* Legacy Scheme: OMAP2/3 */ ++ /* MINOR_REV[0:4], MAJOR_REV[4:7] */ ++ major = (mvr & OMAP_UART_LEGACY_MVR_MAJ_MASK) >> ++ OMAP_UART_LEGACY_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_LEGACY_MVR_MIN_MASK); ++ break; ++ case 1: ++ /* New Scheme: OMAP4+ */ ++ /* MINOR_REV[0:5], MAJOR_REV[8:10] */ ++ major = (mvr & OMAP_UART_MVR_MAJ_MASK) >> ++ OMAP_UART_MVR_MAJ_SHIFT; ++ minor = (mvr & OMAP_UART_MVR_MIN_MASK); ++ break; ++ default: ++ dev_warn(up->port.dev, ++ "Unknown revision, defaulting to highest\n"); ++ /* highest possible revision */ ++ major = 0xff; ++ minor = 0xff; ++ } ++ /* normalize revision for the driver */ ++ revision = UART_BUILD_REVISION(major, minor); ++ ++ switch (revision) { ++ case OMAP_UART_REV_46: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS; ++ break; ++ case OMAP_UART_REV_52: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ case OMAP_UART_REV_63: ++ priv->habit |= UART_ERRATA_i202_MDR1_ACCESS | ++ OMAP_UART_WER_HAS_TX_WAKEUP; ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * AM65x SR1.0, AM65x SR2.0 and J721e SR1.0 don't ++ * don't have RHR_IT_DIS bit in IER2 register. So drop to flag ++ * to enable errata workaround. 
++ */ ++ if (soc_device_match(k3_soc_devices)) ++ priv->habit &= ~UART_HAS_RHR_IT_DIS; ++} ++ ++static void omap8250_uart_qos_work(struct work_struct *work) ++{ ++ struct omap8250_priv *priv; ++ ++ priv = container_of(work, struct omap8250_priv, qos_work); ++ cpu_latency_qos_update_request(&priv->pm_qos_request, priv->latency); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_dma_handle_irq(struct uart_port *port); ++#endif ++ ++static irqreturn_t omap8250_irq(int irq, void *dev_id) ++{ ++ struct uart_port *port = dev_id; ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir, lsr; ++ int ret; ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++ if (up->dma) { ++ ret = omap_8250_dma_handle_irq(port); ++ return IRQ_RETVAL(ret); ++ } ++#endif ++ ++ serial8250_rpm_get(up); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ /* ++ * On K3 SoCs, it is observed that RX TIMEOUT is signalled after ++ * FIFO has been drained, in which case a dummy read of RX FIFO ++ * is required to clear RX TIMEOUT condition. ++ */ ++ if (priv->habit & UART_RX_TIMEOUT_QUIRK && ++ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && ++ serial_port_in(port, UART_OMAP_RX_LVL) == 0) { ++ serial_port_in(port, UART_RX); ++ } ++ ++ /* Stop processing interrupts on input overrun */ ++ if ((lsr & UART_LSR_OE) && up->overrun_backoff_time_ms > 0) { ++ unsigned long delay; ++ ++ up->ier = port->serial_in(port, UART_IER); ++ if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { ++ port->ops->stop_rx(port); ++ } else { ++ /* Keep restarting the timer until ++ * the input overrun subsides. ++ */ ++ cancel_delayed_work(&up->overrun_backoff); ++ } ++ ++ delay = msecs_to_jiffies(up->overrun_backoff_time_ms); ++ schedule_delayed_work(&up->overrun_backoff, delay); ++ } ++ ++ serial8250_rpm_put(up); ++ ++ return IRQ_RETVAL(ret); ++} ++ ++static int omap_8250_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ int ret; ++ ++ if (priv->wakeirq) { ++ ret = dev_pm_set_dedicated_wake_irq(port->dev, priv->wakeirq); ++ if (ret) ++ return ret; ++ } ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* Disable DMA for console UART */ ++ if (uart_console(port)) ++ up->dma = NULL; ++ ++ if (up->dma) { ++ ret = serial8250_request_dma(up); ++ if (ret) { ++ dev_warn_ratelimited(port->dev, ++ "failed to request DMA\n"); ++ up->dma = NULL; ++ } ++ } ++ ++ ret = request_irq(port->irq, omap8250_irq, IRQF_SHARED, ++ dev_name(port->dev), port); ++ if (ret < 0) ++ goto err; ++ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ ++#ifdef CONFIG_PM ++ up->capabilities |= UART_CAP_RPM; ++#endif ++ ++ /* Enable module level wake up */ ++ priv->wer = OMAP_UART_WER_MOD_WKUP; ++ if (priv->habit & OMAP_UART_WER_HAS_TX_WAKEUP) ++ priv->wer |= OMAP_UART_TX_WAKEUP_EN; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ ++ if (up->dma && !(priv->habit & UART_HAS_EFR2)) ++ up->dma->rx_dma(up); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ return 0; ++err: ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ dev_pm_clear_wake_irq(port->dev); ++ return ret; ++} ++ 
++static void omap_8250_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = port->private_data; ++ ++ flush_work(&priv->qos_work); ++ if (up->dma) ++ omap_8250_rx_dma_flush(up); ++ ++ pm_runtime_get_sync(port->dev); ++ ++ serial_out(up, UART_OMAP_WER, 0); ++ if (priv->habit & UART_HAS_EFR2) ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ ++ up->ier = 0; ++ serial_out(up, UART_IER, 0); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ if (up->lcr & UART_LCR_SBC) ++ serial_out(up, UART_LCR, up->lcr & ~UART_LCR_SBC); ++ serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++ free_irq(port->irq, port); ++ dev_pm_clear_wake_irq(port->dev); ++} ++ ++static void omap_8250_throttle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ port->ops->stop_rx(port); ++ priv->throttled = true; ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++static void omap_8250_unthrottle(struct uart_port *port) ++{ ++ struct omap8250_priv *priv = port->private_data; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ pm_runtime_get_sync(port->dev); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ priv->throttled = false; ++ if (up->dma) ++ up->dma->rx_dma(up); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ port->read_status_mask |= UART_LSR_DR; ++ serial_out(up, UART_IER, up->ier); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ pm_runtime_mark_last_busy(port->dev); ++ pm_runtime_put_autosuspend(port->dev); ++} ++ ++#ifdef CONFIG_SERIAL_8250_DMA ++static int omap_8250_rx_dma(struct uart_8250_port *p); ++ ++/* Must be called while priv->rx_dma_lock is held */ ++static void __dma_rx_do_complete(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct tty_port *tty_port = &p->port.state->port; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct dma_chan *rxchan = dma->rxchan; ++ dma_cookie_t cookie; ++ struct dma_tx_state state; ++ int count; ++ int ret; ++ u32 reg; ++ ++ if (!dma->rx_running) ++ goto out; ++ ++ cookie = dma->rx_cookie; ++ dma->rx_running = 0; ++ ++ /* Re-enable RX FIFO interrupt now that transfer is complete */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg &= ~UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dmaengine_tx_status(rxchan, cookie, &state); ++ ++ count = dma->rx_size - state.residue + state.in_flight_bytes; ++ if (count < dma->rx_size) { ++ dmaengine_terminate_async(rxchan); ++ ++ /* ++ * Poll for teardown to complete which guarantees in ++ * flight data is drained. 
++ */ ++ if (state.in_flight_bytes) { ++ int poll_count = 25; ++ ++ while (dmaengine_tx_status(rxchan, cookie, NULL) && ++ poll_count--) ++ cpu_relax(); ++ ++ if (poll_count == -1) ++ dev_err(p->port.dev, "teardown incomplete\n"); ++ } ++ } ++ if (!count) ++ goto out; ++ ret = tty_insert_flip_string(tty_port, dma->rx_buf, count); ++ ++ p->port.icount.rx += ret; ++ p->port.icount.buf_overrun += count - ret; ++out: ++ ++ tty_flip_buffer_push(tty_port); ++} ++ ++static void __dma_rx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ /* ++ * If the tx status is not DMA_COMPLETE, then this is a delayed ++ * completion callback. A previous RX timeout flush would have ++ * already pushed the data, so exit. ++ */ ++ if (dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state) != ++ DMA_COMPLETE) { ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ return; ++ } ++ __dma_rx_do_complete(p); ++ if (!priv->throttled) { ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(p, UART_IER, p->ier); ++ if (!(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static void omap_8250_rx_dma_flush(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ struct dma_tx_state state; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (!dma->rx_running) { ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return; ++ } ++ ++ ret = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state); ++ if (ret == DMA_IN_PROGRESS) { ++ ret = dmaengine_pause(dma->rxchan); ++ if (WARN_ON_ONCE(ret)) ++ priv->rx_dma_broken = true; ++ } ++ __dma_rx_do_complete(p); ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++} ++ ++static int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ struct omap8250_priv *priv = p->port.private_data; ++ struct uart_8250_dma *dma = p->dma; ++ int err = 0; ++ struct dma_async_tx_descriptor *desc; ++ unsigned long flags; ++ u32 reg; ++ ++ if (priv->rx_dma_broken) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&priv->rx_dma_lock, flags); ++ ++ if (dma->rx_running) { ++ enum dma_status state; ++ ++ state = dmaengine_tx_status(dma->rxchan, dma->rx_cookie, NULL); ++ if (state == DMA_COMPLETE) { ++ /* ++ * Disable RX interrupts to allow RX DMA completion ++ * callback to run. ++ */ ++ p->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(p, UART_IER, p->ier); ++ } ++ goto out; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->rxchan, dma->rx_addr, ++ dma->rx_size, DMA_DEV_TO_MEM, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ dma->rx_running = 1; ++ desc->callback = __dma_rx_complete; ++ desc->callback_param = p; ++ ++ dma->rx_cookie = dmaengine_submit(desc); ++ ++ /* ++ * Disable RX FIFO interrupt while RX DMA is enabled, else ++ * spurious interrupt may be raised when data is in the RX FIFO ++ * but is yet to be drained by DMA. 
++ */ ++ if (priv->habit & UART_HAS_RHR_IT_DIS) { ++ reg = serial_in(p, UART_OMAP_IER2); ++ reg |= UART_OMAP_IER2_RHR_IT_DIS; ++ serial_out(p, UART_OMAP_IER2, UART_OMAP_IER2_RHR_IT_DIS); ++ } ++ ++ dma_async_issue_pending(dma->rxchan); ++out: ++ spin_unlock_irqrestore(&priv->rx_dma_lock, flags); ++ return err; ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p); ++ ++static void omap_8250_dma_tx_complete(void *param) ++{ ++ struct uart_8250_port *p = param; ++ struct uart_8250_dma *dma = p->dma; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ unsigned long flags; ++ bool en_thri = false; ++ struct omap8250_priv *priv = p->port.private_data; ++ ++ dma_sync_single_for_cpu(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ ++ dma->tx_running = 0; ++ ++ xmit->tail += dma->tx_size; ++ xmit->tail &= UART_XMIT_SIZE - 1; ++ p->port.icount.tx += dma->tx_size; ++ ++ if (priv->delayed_restore) { ++ priv->delayed_restore = 0; ++ omap8250_restore_regs(p); ++ } ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(&p->port); ++ ++ if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) { ++ int ret; ++ ++ ret = omap_8250_tx_dma(p); ++ if (ret) ++ en_thri = true; ++ } else if (p->capabilities & UART_CAP_RPM) { ++ en_thri = true; ++ } ++ ++ if (en_thri) { ++ dma->tx_err = 1; ++ serial8250_set_THRI(p); ++ } ++ ++ spin_unlock_irqrestore(&p->port.lock, flags); ++} ++ ++static int omap_8250_tx_dma(struct uart_8250_port *p) ++{ ++ struct uart_8250_dma *dma = p->dma; ++ struct omap8250_priv *priv = p->port.private_data; ++ struct circ_buf *xmit = &p->port.state->xmit; ++ struct dma_async_tx_descriptor *desc; ++ unsigned int skip_byte = 0; ++ int ret; ++ ++ if (dma->tx_running) ++ return 0; ++ if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { ++ ++ /* ++ * Even if no data, we need to return an error for the two cases ++ * below so serial8250_tx_chars() is invoked and properly clears ++ * THRI and/or runtime suspend. ++ */ ++ if (dma->tx_err || p->capabilities & UART_CAP_RPM) { ++ ret = -EBUSY; ++ goto err; ++ } ++ serial8250_clear_THRI(p); ++ return 0; ++ } ++ ++ dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); ++ if (priv->habit & OMAP_DMA_TX_KICK) { ++ u8 tx_lvl; ++ ++ /* ++ * We need to put the first byte into the FIFO in order to start ++ * the DMA transfer. For transfers smaller than four bytes we ++ * don't bother doing DMA at all. It seem not matter if there ++ * are still bytes in the FIFO from the last transfer (in case ++ * we got here directly from omap_8250_dma_tx_complete()). Bytes ++ * leaving the FIFO seem not to trigger the DMA transfer. It is ++ * really the byte that we put into the FIFO. ++ * If the FIFO is already full then we most likely got here from ++ * omap_8250_dma_tx_complete(). And this means the DMA engine ++ * just completed its work. We don't have to wait the complete ++ * 86us at 115200,8n1 but around 60us (not to mention lower ++ * baudrates). So in that case we take the interrupt and try ++ * again with an empty FIFO. 
++ */ ++ tx_lvl = serial_in(p, UART_OMAP_TX_LVL); ++ if (tx_lvl == p->tx_loadsz) { ++ ret = -EBUSY; ++ goto err; ++ } ++ if (dma->tx_size < 4) { ++ ret = -EINVAL; ++ goto err; ++ } ++ skip_byte = 1; ++ } ++ ++ desc = dmaengine_prep_slave_single(dma->txchan, ++ dma->tx_addr + xmit->tail + skip_byte, ++ dma->tx_size - skip_byte, DMA_MEM_TO_DEV, ++ DMA_PREP_INTERRUPT | DMA_CTRL_ACK); ++ if (!desc) { ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ dma->tx_running = 1; ++ ++ desc->callback = omap_8250_dma_tx_complete; ++ desc->callback_param = p; ++ ++ dma->tx_cookie = dmaengine_submit(desc); ++ ++ dma_sync_single_for_device(dma->txchan->device->dev, dma->tx_addr, ++ UART_XMIT_SIZE, DMA_TO_DEVICE); ++ ++ dma_async_issue_pending(dma->txchan); ++ if (dma->tx_err) ++ dma->tx_err = 0; ++ ++ serial8250_clear_THRI(p); ++ if (skip_byte) ++ serial_out(p, UART_TX, xmit->buf[xmit->tail]); ++ return 0; ++err: ++ dma->tx_err = 1; ++ return ret; ++} ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ case UART_IIR_RDI: ++ omap_8250_rx_dma_flush(up); ++ return true; ++ } ++ return omap_8250_rx_dma(up); ++} ++ ++static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status) ++{ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (iir & UART_IIR_RDI)) { ++ if (handle_rx_dma(up, iir)) { ++ status = serial8250_rx_chars(up, status); ++ omap_8250_rx_dma(up); ++ } ++ } ++ ++ return status; ++} ++ ++static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, ++ u16 status) ++{ ++ /* ++ * Queue a new transfer if FIFO has data. ++ */ ++ if ((status & (UART_LSR_DR | UART_LSR_BI)) && ++ (up->ier & UART_IER_RDI)) { ++ omap_8250_rx_dma(up); ++ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE); ++ } else if ((iir & 0x3f) == UART_IIR_RX_TIMEOUT) { ++ /* ++ * Disable RX timeout, read IIR to clear ++ * current timeout condition, clear EFR2 to ++ * periodic timeouts, re-enable interrupts. ++ */ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ serial_out(up, UART_IER, up->ier); ++ omap_8250_rx_dma_flush(up); ++ serial_in(up, UART_IIR); ++ serial_out(up, UART_OMAP_EFR2, 0x0); ++ up->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_out(up, UART_IER, up->ier); ++ } ++} ++ ++/* ++ * This is mostly serial8250_handle_irq(). We have a slightly different DMA ++ * hoook for RX/TX and need different logic for them in the ISR. Therefore we ++ * use the default routine in the non-DMA case and this one for with DMA. ++ */ ++static int omap_8250_dma_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct omap8250_priv *priv = up->port.private_data; ++ u16 status; ++ u8 iir; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ if (iir & UART_IIR_NO_INT) { ++ serial8250_rpm_put(up); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock(&port->lock); ++ ++ status = serial_port_in(port, UART_LSR); ++ ++ if (priv->habit & UART_HAS_EFR2) ++ am654_8250_handle_rx_dma(up, iir, status); ++ else ++ status = omap_8250_handle_rx_dma(up, iir, status); ++ ++ serial8250_modem_status(up); ++ if (status & UART_LSR_THRE && up->dma->tx_err) { ++ if (uart_tx_stopped(&up->port) || ++ uart_circ_empty(&up->port.state->xmit)) { ++ up->dma->tx_err = 0; ++ serial8250_tx_chars(up); ++ } else { ++ /* ++ * try again due to an earlier failer which ++ * might have been resolved by now. 
++ */ ++ if (omap_8250_tx_dma(up)) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ uart_unlock_and_check_sysrq(port); ++ ++ serial8250_rpm_put(up); ++ return 1; ++} ++ ++static bool the_no_dma_filter_fn(struct dma_chan *chan, void *param) ++{ ++ return false; ++} ++ ++#else ++ ++static inline int omap_8250_rx_dma(struct uart_8250_port *p) ++{ ++ return -EINVAL; ++} ++#endif ++ ++static int omap8250_no_handle_irq(struct uart_port *port) ++{ ++ /* IRQ has not been requested but handling irq? */ ++ WARN_ONCE(1, "Unexpected irq handling before port startup\n"); ++ return 0; ++} ++ ++static struct omap8250_dma_params am654_dma = { ++ .rx_size = SZ_2K, ++ .rx_trigger = 1, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_dma_params am33xx_dma = { ++ .rx_size = RX_TRIGGER, ++ .rx_trigger = RX_TRIGGER, ++ .tx_trigger = TX_TRIGGER, ++}; ++ ++static struct omap8250_platdata am654_platdata = { ++ .dma_params = &am654_dma, ++ .habit = UART_HAS_EFR2 | UART_HAS_RHR_IT_DIS | ++ UART_RX_TIMEOUT_QUIRK, ++}; ++ ++static struct omap8250_platdata am33xx_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = OMAP_DMA_TX_KICK | UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static struct omap8250_platdata omap4_platdata = { ++ .dma_params = &am33xx_dma, ++ .habit = UART_ERRATA_CLOCK_DISABLE, ++}; ++ ++static const struct of_device_id omap8250_dt_ids[] = { ++ { .compatible = "ti,am654-uart", .data = &am654_platdata, }, ++ { .compatible = "ti,omap2-uart" }, ++ { .compatible = "ti,omap3-uart" }, ++ { .compatible = "ti,omap4-uart", .data = &omap4_platdata, }, ++ { .compatible = "ti,am3352-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,am4372-uart", .data = &am33xx_platdata, }, ++ { .compatible = "ti,dra742-uart", .data = &omap4_platdata, }, ++ {}, ++}; ++MODULE_DEVICE_TABLE(of, omap8250_dt_ids); ++ ++static int omap8250_probe(struct platform_device *pdev) ++{ ++ struct device_node *np = pdev->dev.of_node; ++ struct omap8250_priv *priv; ++ const struct omap8250_platdata *pdata; ++ struct uart_8250_port up; ++ struct resource *regs; ++ void __iomem *membase; ++ int irq, ret; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!regs) { ++ dev_err(&pdev->dev, "missing registers\n"); ++ return -EINVAL; ++ } ++ ++ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ membase = devm_ioremap(&pdev->dev, regs->start, ++ resource_size(regs)); ++ if (!membase) ++ return -ENODEV; ++ ++ memset(&up, 0, sizeof(up)); ++ up.port.dev = &pdev->dev; ++ up.port.mapbase = regs->start; ++ up.port.membase = membase; ++ up.port.irq = irq; ++ /* ++ * It claims to be 16C750 compatible however it is a little different. ++ * It has EFR and has no FCR7_64byte bit. The AFE (which it claims to ++ * have) is enabled via EFR instead of MCR. The type is set here 8250 ++ * just to get things going. UNKNOWN does not work for a few reasons and ++ * we don't need our own type since we don't use 8250's set_termios() ++ * or pm callback. ++ */ ++ up.port.type = PORT_8250; ++ up.port.iotype = UPIO_MEM; ++ up.port.flags = UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_SOFT_FLOW | ++ UPF_HARD_FLOW; ++ up.port.private_data = priv; ++ ++ up.port.regshift = 2; ++ up.port.fifosize = 64; ++ up.tx_loadsz = 64; ++ up.capabilities = UART_CAP_FIFO; ++#ifdef CONFIG_PM ++ /* ++ * Runtime PM is mostly transparent. However to do it right we need to a ++ * TX empty interrupt before we can put the device to auto idle. 
So if ++ * PM is not enabled we don't add that flag and can spare that one extra ++ * interrupt in the TX path. ++ */ ++ up.capabilities |= UART_CAP_RPM; ++#endif ++ up.port.set_termios = omap_8250_set_termios; ++ up.port.set_mctrl = omap8250_set_mctrl; ++ up.port.pm = omap_8250_pm; ++ up.port.startup = omap_8250_startup; ++ up.port.shutdown = omap_8250_shutdown; ++ up.port.throttle = omap_8250_throttle; ++ up.port.unthrottle = omap_8250_unthrottle; ++ up.port.rs485_config = serial8250_em485_config; ++ up.port.rs485_supported = serial8250_em485_supported; ++ up.rs485_start_tx = serial8250_em485_start_tx; ++ up.rs485_stop_tx = serial8250_em485_stop_tx; ++ up.port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ ret = of_alias_get_id(np, "serial"); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "failed to get alias\n"); ++ return ret; ++ } ++ up.port.line = ret; ++ ++ if (of_property_read_u32(np, "clock-frequency", &up.port.uartclk)) { ++ struct clk *clk; ++ ++ clk = devm_clk_get(&pdev->dev, NULL); ++ if (IS_ERR(clk)) { ++ if (PTR_ERR(clk) == -EPROBE_DEFER) ++ return -EPROBE_DEFER; ++ } else { ++ up.port.uartclk = clk_get_rate(clk); ++ } ++ } ++ ++ if (of_property_read_u32(np, "overrun-throttle-ms", ++ &up.overrun_backoff_time_ms) != 0) ++ up.overrun_backoff_time_ms = 0; ++ ++ priv->wakeirq = irq_of_parse_and_map(np, 1); ++ ++ pdata = of_device_get_match_data(&pdev->dev); ++ if (pdata) ++ priv->habit |= pdata->habit; ++ ++ if (!up.port.uartclk) { ++ up.port.uartclk = DEFAULT_CLK_SPEED; ++ dev_warn(&pdev->dev, ++ "No clock speed specified: using default: %d\n", ++ DEFAULT_CLK_SPEED); ++ } ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ priv->calc_latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ cpu_latency_qos_add_request(&priv->pm_qos_request, priv->latency); ++ INIT_WORK(&priv->qos_work, omap8250_uart_qos_work); ++ ++ spin_lock_init(&priv->rx_dma_lock); ++ ++ device_init_wakeup(&pdev->dev, true); ++ pm_runtime_enable(&pdev->dev); ++ pm_runtime_use_autosuspend(&pdev->dev); ++ ++ /* ++ * Disable runtime PM until autosuspend delay unless specifically ++ * enabled by the user via sysfs. This is the historic way to ++ * prevent an unsafe default policy with lossy characters on wake-up. ++ * For serdev devices this is not needed, the policy can be managed by ++ * the serdev driver. ++ */ ++ if (!of_get_available_child_count(pdev->dev.of_node)) ++ pm_runtime_set_autosuspend_delay(&pdev->dev, -1); ++ ++ pm_runtime_irq_safe(&pdev->dev); ++ ++ pm_runtime_get_sync(&pdev->dev); ++ ++ omap_serial_fill_features_erratas(&up, priv); ++ up.port.handle_irq = omap8250_no_handle_irq; ++ priv->rx_trigger = RX_TRIGGER; ++ priv->tx_trigger = TX_TRIGGER; ++#ifdef CONFIG_SERIAL_8250_DMA ++ /* ++ * Oh DMA support. If there are no DMA properties in the DT then ++ * we will fall back to a generic DMA channel which does not ++ * really work here. To ensure that we do not get a generic DMA ++ * channel assigned, we have the the_no_dma_filter_fn() here. ++ * To avoid "failed to request DMA" messages we check for DMA ++ * properties in DT. 
++ */ ++ ret = of_property_count_strings(np, "dma-names"); ++ if (ret == 2) { ++ struct omap8250_dma_params *dma_params = NULL; ++ ++ up.dma = &priv->omap8250_dma; ++ up.dma->fn = the_no_dma_filter_fn; ++ up.dma->tx_dma = omap_8250_tx_dma; ++ up.dma->rx_dma = omap_8250_rx_dma; ++ if (pdata) ++ dma_params = pdata->dma_params; ++ ++ if (dma_params) { ++ up.dma->rx_size = dma_params->rx_size; ++ up.dma->rxconf.src_maxburst = dma_params->rx_trigger; ++ up.dma->txconf.dst_maxburst = dma_params->tx_trigger; ++ priv->rx_trigger = dma_params->rx_trigger; ++ priv->tx_trigger = dma_params->tx_trigger; ++ } else { ++ up.dma->rx_size = RX_TRIGGER; ++ up.dma->rxconf.src_maxburst = RX_TRIGGER; ++ up.dma->txconf.dst_maxburst = TX_TRIGGER; ++ } ++ } ++#endif ++ ret = serial8250_register_8250_port(&up); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "unable to register 8250 port\n"); ++ goto err; ++ } ++ priv->line = ret; ++ platform_set_drvdata(pdev, priv); ++ pm_runtime_mark_last_busy(&pdev->dev); ++ pm_runtime_put_autosuspend(&pdev->dev); ++ return 0; ++err: ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ pm_runtime_disable(&pdev->dev); ++ return ret; ++} ++ ++static int omap8250_remove(struct platform_device *pdev) ++{ ++ struct omap8250_priv *priv = platform_get_drvdata(pdev); ++ int err; ++ ++ err = pm_runtime_resume_and_get(&pdev->dev); ++ if (err) ++ return err; ++ ++ pm_runtime_dont_use_autosuspend(&pdev->dev); ++ pm_runtime_put_sync(&pdev->dev); ++ flush_work(&priv->qos_work); ++ pm_runtime_disable(&pdev->dev); ++ serial8250_unregister_port(priv->line); ++ cpu_latency_qos_remove_request(&priv->pm_qos_request); ++ device_init_wakeup(&pdev->dev, false); ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int omap8250_prepare(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return 0; ++ priv->is_suspending = true; ++ return 0; ++} ++ ++static void omap8250_complete(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ if (!priv) ++ return; ++ priv->is_suspending = false; ++} ++ ++static int omap8250_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ ++ serial8250_suspend_port(priv->line); ++ ++ pm_runtime_get_sync(dev); ++ if (!device_may_wakeup(dev)) ++ priv->wer = 0; ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ pm_runtime_mark_last_busy(dev); ++ pm_runtime_put_autosuspend(dev); ++ ++ flush_work(&priv->qos_work); ++ return 0; ++} ++ ++static int omap8250_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ ++ serial8250_resume_port(priv->line); ++ return 0; ++} ++#else ++#define omap8250_prepare NULL ++#define omap8250_complete NULL ++#endif ++ ++#ifdef CONFIG_PM ++static int omap8250_lost_context(struct uart_8250_port *up) ++{ ++ u32 val; ++ ++ val = serial_in(up, UART_OMAP_SCR); ++ /* ++ * If we lose context, then SCR is set to its reset value of zero. ++ * After set_termios() we set bit 3 of SCR (TX_EMPTY_CTL_IT) to 1, ++ * among other bits, to never set the register back to zero again. 
++ */ ++ if (!val) ++ return 1; ++ return 0; ++} ++ ++/* TODO: in future, this should happen via API in drivers/reset/ */ ++static int omap8250_soft_reset(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up = serial8250_get_port(priv->line); ++ int timeout = 100; ++ int sysc; ++ int syss; ++ ++ /* ++ * At least on omap4, unused uarts may not idle after reset without ++ * a basic scr dma configuration even with no dma in use. The ++ * module clkctrl status bits will be 1 instead of 3 blocking idle ++ * for the whole clockdomain. The softreset below will clear scr, ++ * and we restore it on resume so this is safe to do on all SoCs ++ * needing omap8250_soft_reset() quirk. Do it in two writes as ++ * recommended in the comment for omap8250_update_scr(). ++ */ ++ serial_out(up, UART_OMAP_SCR, OMAP_UART_SCR_DMAMODE_1); ++ serial_out(up, UART_OMAP_SCR, ++ OMAP_UART_SCR_DMAMODE_1 | OMAP_UART_SCR_DMAMODE_CTL); ++ ++ sysc = serial_in(up, UART_OMAP_SYSC); ++ ++ /* softreset the UART */ ++ sysc |= OMAP_UART_SYSC_SOFTRESET; ++ serial_out(up, UART_OMAP_SYSC, sysc); ++ ++ /* By experiments, 1us enough for reset complete on AM335x */ ++ do { ++ udelay(1); ++ syss = serial_in(up, UART_OMAP_SYSS); ++ } while (--timeout && !(syss & OMAP_UART_SYSS_RESETDONE)); ++ ++ if (!timeout) { ++ dev_err(dev, "timed out waiting for reset done\n"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++static int omap8250_runtime_suspend(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ /* ++ * When using 'no_console_suspend', the console UART must not be ++ * suspended. Since driver suspend is managed by runtime suspend, ++ * preventing runtime suspend (by returning error) will keep device ++ * active during suspend. 
++ */ ++ if (priv->is_suspending && !console_suspend_enabled) { ++ if (uart_console(&up->port)) ++ return -EBUSY; ++ } ++ ++ if (priv->habit & UART_ERRATA_CLOCK_DISABLE) { ++ int ret; ++ ++ ret = omap8250_soft_reset(dev); ++ if (ret) ++ return ret; ++ ++ /* Restore to UART mode after reset (for wakeup) */ ++ omap8250_update_mdr1(up, priv); ++ /* Restore wakeup enable register */ ++ serial_out(up, UART_OMAP_WER, priv->wer); ++ } ++ ++ if (up->dma && up->dma->rxchan) ++ omap_8250_rx_dma_flush(up); ++ ++ priv->latency = PM_QOS_CPU_LATENCY_DEFAULT_VALUE; ++ schedule_work(&priv->qos_work); ++ ++ return 0; ++} ++ ++static int omap8250_runtime_resume(struct device *dev) ++{ ++ struct omap8250_priv *priv = dev_get_drvdata(dev); ++ struct uart_8250_port *up; ++ ++ /* In case runtime-pm tries this before we are setup */ ++ if (!priv) ++ return 0; ++ ++ up = serial8250_get_port(priv->line); ++ ++ if (omap8250_lost_context(up)) ++ omap8250_restore_regs(up); ++ ++ if (up->dma && up->dma->rxchan && !(priv->habit & UART_HAS_EFR2)) ++ omap_8250_rx_dma(up); ++ ++ priv->latency = priv->calc_latency; ++ schedule_work(&priv->qos_work); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SERIAL_8250_OMAP_TTYO_FIXUP ++static int __init omap8250_console_fixup(void) ++{ ++ char *omap_str; ++ char *options; ++ u8 idx; ++ ++ if (strstr(boot_command_line, "console=ttyS")) ++ /* user set a ttyS based name for the console */ ++ return 0; ++ ++ omap_str = strstr(boot_command_line, "console=ttyO"); ++ if (!omap_str) ++ /* user did not set ttyO based console, so we don't care */ ++ return 0; ++ ++ omap_str += 12; ++ if ('0' <= *omap_str && *omap_str <= '9') ++ idx = *omap_str - '0'; ++ else ++ return 0; ++ ++ omap_str++; ++ if (omap_str[0] == ',') { ++ omap_str++; ++ options = omap_str; ++ } else { ++ options = NULL; ++ } ++ ++ add_preferred_console("ttyS", idx, options); ++ pr_err("WARNING: Your 'console=ttyO%d' has been replaced by 'ttyS%d'\n", ++ idx, idx); ++ pr_err("This ensures that you still see kernel messages. 
Please\n"); ++ pr_err("update your kernel commandline.\n"); ++ return 0; ++} ++console_initcall(omap8250_console_fixup); ++#endif ++ ++static const struct dev_pm_ops omap8250_dev_pm_ops = { ++ SET_SYSTEM_SLEEP_PM_OPS(omap8250_suspend, omap8250_resume) ++ SET_RUNTIME_PM_OPS(omap8250_runtime_suspend, ++ omap8250_runtime_resume, NULL) ++ .prepare = omap8250_prepare, ++ .complete = omap8250_complete, ++}; ++ ++static struct platform_driver omap8250_platform_driver = { ++ .driver = { ++ .name = "omap8250", ++ .pm = &omap8250_dev_pm_ops, ++ .of_match_table = omap8250_dt_ids, ++ }, ++ .probe = omap8250_probe, ++ .remove = omap8250_remove, ++}; ++module_platform_driver(omap8250_platform_driver); ++ ++MODULE_AUTHOR("Sebastian Andrzej Siewior"); ++MODULE_DESCRIPTION("OMAP 8250 Driver"); ++MODULE_LICENSE("GPL v2"); +Binary files linux.orig/drivers/tty/serial/8250/.8250_omap.c.rej.swp and linux/drivers/tty/serial/8250/.8250_omap.c.rej.swp differ +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c linux/drivers/tty/serial/8250/8250_port.c +--- linux.orig/drivers/tty/serial/8250/8250_port.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c 2022-12-04 10:40:26.700034085 -0500 +@@ -743,7 +743,7 @@ static void serial8250_set_sleep(struct serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } @@ -3637,7 +22982,7 @@ index 2030a92ac66e7..326549603740d 100644 if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); -@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_port *up) +@@ -1017,8 +1017,11 @@ static int broken_efr(struct uart_8250_p */ static void autoconfig_16550a(struct uart_8250_port *up) { @@ -3649,7 +22994,7 @@ index 2030a92ac66e7..326549603740d 100644 up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; -@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1130,6 +1133,11 @@ static void autoconfig_16550a(struct uar return; } @@ -3661,7 +23006,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's -@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uart_8250_port *up) +@@ -1165,6 +1173,9 @@ static void autoconfig_16550a(struct uar } serial_out(up, UART_IER, iersave); @@ -3671,7 +23016,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. -@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1187,8 +1198,10 @@ static void autoconfig(struct uart_8250_ unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; @@ -3682,7 +23027,7 @@ index 2030a92ac66e7..326549603740d 100644 if (!port->iobase && !port->mapbase && !port->membase) return; -@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1206,6 +1219,11 @@ static void autoconfig(struct uart_8250_ up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { @@ -3694,7 +23039,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. 
-@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1235,6 +1253,10 @@ static void autoconfig(struct uart_8250_ #endif scratch3 = serial_in(up, UART_IER) & 0x0f; serial_out(up, UART_IER, scratch); @@ -3705,7 +23050,7 @@ index 2030a92ac66e7..326549603740d 100644 if (scratch2 != 0 || scratch3 != 0x0F) { /* * We failed; there's nothing here -@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_port *up) +@@ -1332,10 +1354,7 @@ static void autoconfig(struct uart_8250_ serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); @@ -3717,7 +23062,7 @@ index 2030a92ac66e7..326549603740d 100644 out_unlock: spin_unlock_irqrestore(&port->lock, flags); -@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1361,7 +1380,9 @@ static void autoconfig_irq(struct uart_8 unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; @@ -3727,7 +23072,7 @@ index 2030a92ac66e7..326549603740d 100644 int irq; if (port->flags & UPF_FOURPORT) { -@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1371,8 +1392,12 @@ static void autoconfig_irq(struct uart_8 inb_p(ICP); } @@ -3741,7 +23086,7 @@ index 2030a92ac66e7..326549603740d 100644 /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); -@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8250_port *up) +@@ -1404,8 +1429,10 @@ static void autoconfig_irq(struct uart_8 if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); @@ -3753,7 +23098,7 @@ index 2030a92ac66e7..326549603740d 100644 port->irq = (irq > 0) ? irq : 0; } -@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct uart_port *port) +@@ -1418,7 +1445,7 @@ static void serial8250_stop_rx(struct ua up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; @@ -3762,7 +23107,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) +@@ -1448,7 +1475,7 @@ void serial8250_em485_stop_tx(struct uar serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; @@ -3771,7 +23116,7 @@ index 2030a92ac66e7..326549603740d 100644 } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); -@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct uart_port *port) +@@ -1697,7 +1724,7 @@ static void serial8250_disable_ms(struct mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; @@ -3780,7 +23125,7 @@ index 2030a92ac66e7..326549603740d 100644 } static void serial8250_enable_ms(struct uart_port *port) -@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct uart_port *port) +@@ -1713,7 +1740,7 @@ static void serial8250_enable_ms(struct up->ier |= UART_IER_MSI; serial8250_rpm_get(up); @@ -3789,7 +23134,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } -@@ -2144,14 +2171,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2147,14 +2174,7 @@ static void serial8250_put_poll_char(str struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); @@ -3805,7 +23150,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* -@@ -2164,7 +2184,7 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2167,7 +2187,7 @@ static void serial8250_put_poll_char(str * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); @@ -3814,7 +23159,7 @@ index 2030a92ac66e7..326549603740d 100644 serial8250_rpm_put(up); } 
-@@ -2173,8 +2193,10 @@ static void serial8250_put_poll_char(struct uart_port *port, +@@ -2176,8 +2196,10 @@ static void serial8250_put_poll_char(str int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3825,7 +23170,7 @@ index 2030a92ac66e7..326549603740d 100644 int retval; u16 lsr; -@@ -2195,7 +2217,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2198,7 +2220,7 @@ int serial8250_do_startup(struct uart_po up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); @@ -3834,7 +23179,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); -@@ -2205,7 +2227,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2208,7 +2230,7 @@ int serial8250_do_startup(struct uart_po if (port->type == PORT_DA830) { /* Reset the port */ @@ -3843,7 +23188,7 @@ index 2030a92ac66e7..326549603740d 100644 serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); mdelay(10); -@@ -2304,6 +2326,8 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2307,6 +2329,8 @@ int serial8250_do_startup(struct uart_po if (retval) goto out; @@ -3852,7 +23197,7 @@ index 2030a92ac66e7..326549603740d 100644 if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; -@@ -2320,6 +2344,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2323,6 +2347,9 @@ int serial8250_do_startup(struct uart_po */ spin_lock_irqsave(&port->lock, flags); @@ -3862,7 +23207,7 @@ index 2030a92ac66e7..326549603740d 100644 wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ -@@ -2330,6 +2357,9 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2333,6 +2360,9 @@ int serial8250_do_startup(struct uart_po iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); @@ -3872,7 +23217,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); if (port->irqflags & IRQF_SHARED) -@@ -2384,10 +2414,14 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2387,10 +2417,14 @@ int serial8250_do_startup(struct uart_po * Do a quick test to see if we receive an interrupt when we enable * the TX irq. 
*/ @@ -3887,7 +23232,7 @@ index 2030a92ac66e7..326549603740d 100644 if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { -@@ -2419,7 +2453,7 @@ int serial8250_do_startup(struct uart_port *port) +@@ -2422,7 +2456,7 @@ dont_test_tx_en: if (up->dma) { const char *msg = NULL; @@ -3896,7 +23241,7 @@ index 2030a92ac66e7..326549603740d 100644 msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; -@@ -2470,7 +2504,7 @@ void serial8250_do_shutdown(struct uart_port *port) +@@ -2473,7 +2507,7 @@ void serial8250_do_shutdown(struct uart_ */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; @@ -3905,7 +23250,7 @@ index 2030a92ac66e7..326549603740d 100644 spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); -@@ -2836,7 +2870,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, +@@ -2839,7 +2873,7 @@ serial8250_do_set_termios(struct uart_po if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; @@ -3914,7 +23259,7 @@ index 2030a92ac66e7..326549603740d 100644 if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; -@@ -3301,7 +3335,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); +@@ -3304,7 +3338,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default #ifdef CONFIG_SERIAL_8250_CONSOLE @@ -3923,7 +23268,7 @@ index 2030a92ac66e7..326549603740d 100644 { struct uart_8250_port *up = up_to_u8250p(port); -@@ -3309,6 +3343,18 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) +@@ -3312,6 +3346,18 @@ static void serial8250_console_putchar(s serial_port_out(port, UART_TX, ch); } @@ -3942,7 +23287,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Restore serial console when h/w power-off detected */ -@@ -3335,6 +3381,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) +@@ -3338,6 +3384,32 @@ static void serial8250_console_restore(s serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } @@ -3975,7 +23320,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Print a string to the serial port using the device FIFO * -@@ -3380,24 +3452,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3383,24 +3455,12 @@ void serial8250_console_write(struct uar struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; @@ -3987,13 +23332,13 @@ index 2030a92ac66e7..326549603740d 100644 - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); -+ spin_lock_irqsave(&port->lock, flags); - +- - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); -- ++ spin_lock_irqsave(&port->lock, flags); + - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else @@ -4002,7 +23347,7 @@ index 2030a92ac66e7..326549603740d 100644 /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { -@@ -3431,10 +3491,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3434,10 +3494,12 @@ void serial8250_console_write(struct uar */ !(up->port.flags & UPF_CONS_FLOW); @@ -4015,7 +23360,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * Finally, wait for transmitter to become empty -@@ -3447,8 +3509,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3450,8 +3512,7 @@ void serial8250_console_write(struct uar if (em485->tx_stopped) 
up->rs485_stop_tx(up); } @@ -4025,7 +23370,7 @@ index 2030a92ac66e7..326549603740d 100644 /* * The receive handling will happen properly because the -@@ -3460,8 +3521,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, +@@ -3463,8 +3524,7 @@ void serial8250_console_write(struct uar if (up->msr_saved_flags) serial8250_modem_status(up); @@ -4035,7 +23380,7 @@ index 2030a92ac66e7..326549603740d 100644 } static unsigned int probe_baud(struct uart_port *port) -@@ -3481,6 +3541,7 @@ static unsigned int probe_baud(struct uart_port *port) +@@ -3484,6 +3544,7 @@ static unsigned int probe_baud(struct ua int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { @@ -4043,7 +23388,7 @@ index 2030a92ac66e7..326549603740d 100644 int baud = 9600; int bits = 8; int parity = 'n'; -@@ -3490,6 +3551,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) +@@ -3493,6 +3554,8 @@ int serial8250_console_setup(struct uart if (!port->iobase && !port->membase) return -ENODEV; @@ -4052,10 +23397,3534 @@ index 2030a92ac66e7..326549603740d 100644 if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) -diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig -index d0b49e15fbf5e..02c308467339c 100644 ---- a/drivers/tty/serial/8250/Kconfig -+++ b/drivers/tty/serial/8250/Kconfig +diff -rupN linux.orig/drivers/tty/serial/8250/8250_port.c.orig linux/drivers/tty/serial/8250/8250_port.c.orig +--- linux.orig/drivers/tty/serial/8250/8250_port.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/drivers/tty/serial/8250/8250_port.c.orig 2022-12-04 10:40:18.432055273 -0500 +@@ -0,0 +1,3521 @@ ++// SPDX-License-Identifier: GPL-2.0+ ++/* ++ * Base port operations for 8250/16550-type serial ports ++ * ++ * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. ++ * Split from 8250_core.c, Copyright (C) 2001 Russell King. ++ * ++ * A note about mapbase / membase ++ * ++ * mapbase is the physical address of the IO port. ++ * membase is an 'ioremapped' cookie. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "8250.h" ++ ++/* Nuvoton NPCM timeout register */ ++#define UART_NPCM_TOR 7 ++#define UART_NPCM_TOIE BIT(7) /* Timeout Interrupt Enable */ ++ ++/* ++ * Debugging. ++ */ ++#if 0 ++#define DEBUG_AUTOCONF(fmt...) printk(fmt) ++#else ++#define DEBUG_AUTOCONF(fmt...) do { } while (0) ++#endif ++ ++/* ++ * Here we define the default xmit fifo size used for each type of UART. 
++ */ ++static const struct serial8250_config uart_config[] = { ++ [PORT_UNKNOWN] = { ++ .name = "unknown", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_8250] = { ++ .name = "8250", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16450] = { ++ .name = "16450", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550] = { ++ .name = "16550", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16550A] = { ++ .name = "16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_CIRRUS] = { ++ .name = "Cirrus", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16650] = { ++ .name = "ST16650", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16650V2] = { ++ .name = "ST16650V2", ++ .fifo_size = 32, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_00, ++ .rxtrig_bytes = {8, 16, 24, 28}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16750] = { ++ .name = "TI16750", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .rxtrig_bytes = {1, 16, 32, 56}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, ++ }, ++ [PORT_STARTECH] = { ++ .name = "Startech", ++ .fifo_size = 1, ++ .tx_loadsz = 1, ++ }, ++ [PORT_16C950] = { ++ .name = "16C950/954", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, ++ .rxtrig_bytes = {16, 32, 112, 120}, ++ /* UART_CAP_EFR breaks billionon CF bluetooth card. */ ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_16654] = { ++ .name = "ST16654", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_10, ++ .rxtrig_bytes = {8, 16, 56, 60}, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_16850] = { ++ .name = "XR16850", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, ++ }, ++ [PORT_RSA] = { ++ .name = "RSA", ++ .fifo_size = 2048, ++ .tx_loadsz = 2048, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NS16550A] = { ++ .name = "NS16550A", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_NATSEMI, ++ }, ++ [PORT_XSCALE] = { ++ .name = "XScale", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, ++ }, ++ [PORT_OCTEON] = { ++ .name = "OCTEON", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_AR7] = { ++ .name = "AR7", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .flags = UART_CAP_FIFO /* | UART_CAP_AFE */, ++ }, ++ [PORT_U6_16550A] = { ++ .name = "U6_16550A", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_TEGRA] = { ++ .name = "Tegra", ++ .fifo_size = 32, ++ .tx_loadsz = 8, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | ++ UART_FCR_T_TRIG_01, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_RTOIE, ++ }, ++ [PORT_XR17D15X] = { 
++ .name = "XR17D15X", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_XR17V35X] = { ++ .name = "XR17V35X", ++ .fifo_size = 256, ++ .tx_loadsz = 256, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | ++ UART_FCR_T_TRIG_11, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | ++ UART_CAP_SLEEP, ++ }, ++ [PORT_LPC3220] = { ++ .name = "LPC3220", ++ .fifo_size = 64, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_BRCM_TRUMANAGE] = { ++ .name = "TruManage", ++ .fifo_size = 1, ++ .tx_loadsz = 1024, ++ .flags = UART_CAP_HFIFO, ++ }, ++ [PORT_8250_CIR] = { ++ .name = "CIR port" ++ }, ++ [PORT_ALTR_16550_F32] = { ++ .name = "Altera 16550 FIFO32", ++ .fifo_size = 32, ++ .tx_loadsz = 32, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 8, 16, 30}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F64] = { ++ .name = "Altera 16550 FIFO64", ++ .fifo_size = 64, ++ .tx_loadsz = 64, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 16, 32, 62}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_ALTR_16550_F128] = { ++ .name = "Altera 16550 FIFO128", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 126}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ /* ++ * tx_loadsz is set to 63-bytes instead of 64-bytes to implement ++ * workaround of errata A-008006 which states that tx_loadsz should ++ * be configured less than Maximum supported fifo bytes. ++ */ ++ [PORT_16550A_FSL64] = { ++ .name = "16550A_FSL64", ++ .fifo_size = 64, ++ .tx_loadsz = 63, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR7_64BYTE, ++ .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, ++ }, ++ [PORT_RT2880] = { ++ .name = "Palmchip BK-3103", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_DA830] = { ++ .name = "TI DA8xx/66AK2x", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | ++ UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO | UART_CAP_AFE, ++ }, ++ [PORT_MTK_BTIF] = { ++ .name = "MediaTek BTIF", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_NPCM] = { ++ .name = "Nuvoton 16550", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++ [PORT_SUNIX] = { ++ .name = "Sunix", ++ .fifo_size = 128, ++ .tx_loadsz = 128, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, ++ .rxtrig_bytes = {1, 32, 64, 112}, ++ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, ++ }, ++ [PORT_ASPEED_VUART] = { ++ .name = "ASPEED VUART", ++ .fifo_size = 16, ++ .tx_loadsz = 16, ++ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, ++ .rxtrig_bytes = {1, 4, 8, 14}, ++ .flags = UART_CAP_FIFO, ++ }, ++}; ++ ++/* Uart divisor latch read */ ++static int default_serial_dl_read(struct uart_8250_port *up) ++{ ++ /* Assign these in pieces to truncate any bits above 7. 
*/ ++ unsigned char dll = serial_in(up, UART_DLL); ++ unsigned char dlm = serial_in(up, UART_DLM); ++ ++ return dll | dlm << 8; ++} ++ ++/* Uart divisor latch write */ ++static void default_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ serial_out(up, UART_DLL, value & 0xff); ++ serial_out(up, UART_DLM, value >> 8 & 0xff); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ ++#define UART_REG_UNMAPPED -1 ++ ++/* Au1x00/RT288x UART hardware has a weird register layout */ ++static const s8 au_io_in_map[8] = { ++ [UART_RX] = 0, ++ [UART_IER] = 2, ++ [UART_IIR] = 3, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = 7, ++ [UART_MSR] = 8, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++static const s8 au_io_out_map[8] = { ++ [UART_TX] = 1, ++ [UART_IER] = 2, ++ [UART_FCR] = 4, ++ [UART_LCR] = 5, ++ [UART_MCR] = 6, ++ [UART_LSR] = UART_REG_UNMAPPED, ++ [UART_MSR] = UART_REG_UNMAPPED, ++ [UART_SCR] = UART_REG_UNMAPPED, ++}; ++ ++unsigned int au_serial_in(struct uart_port *p, int offset) ++{ ++ if (offset >= ARRAY_SIZE(au_io_in_map)) ++ return UINT_MAX; ++ offset = au_io_in_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return UINT_MAX; ++ return __raw_readl(p->membase + (offset << p->regshift)); ++} ++ ++void au_serial_out(struct uart_port *p, int offset, int value) ++{ ++ if (offset >= ARRAY_SIZE(au_io_out_map)) ++ return; ++ offset = au_io_out_map[offset]; ++ if (offset == UART_REG_UNMAPPED) ++ return; ++ __raw_writel(value, p->membase + (offset << p->regshift)); ++} ++ ++/* Au1x00 haven't got a standard divisor latch */ ++static int au_serial_dl_read(struct uart_8250_port *up) ++{ ++ return __raw_readl(up->port.membase + 0x28); ++} ++ ++static void au_serial_dl_write(struct uart_8250_port *up, int value) ++{ ++ __raw_writel(value, up->port.membase + 0x28); ++} ++ ++#endif ++ ++static unsigned int hub6_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ return inb(p->iobase + 1); ++} ++ ++static void hub6_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(p->hub6 - 1 + offset, p->iobase); ++ outb(value, p->iobase + 1); ++} ++ ++static unsigned int mem_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readb(p->membase + offset); ++} ++ ++static void mem_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writeb(value, p->membase + offset); ++} ++ ++static void mem16_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writew(value, p->membase + offset); ++} ++ ++static unsigned int mem16_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readw(p->membase + offset); ++} ++ ++static void mem32_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ writel(value, p->membase + offset); ++} ++ ++static unsigned int mem32_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return readl(p->membase + offset); ++} ++ ++static void mem32be_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ iowrite32be(value, p->membase + offset); ++} ++ ++static unsigned int mem32be_serial_in(struct uart_port *p, int offset) ++{ ++ offset = offset << p->regshift; ++ return ioread32be(p->membase + offset); ++} ++ ++static unsigned int io_serial_in(struct uart_port *p, int offset) ++{ ++ offset = 
offset << p->regshift; ++ return inb(p->iobase + offset); ++} ++ ++static void io_serial_out(struct uart_port *p, int offset, int value) ++{ ++ offset = offset << p->regshift; ++ outb(value, p->iobase + offset); ++} ++ ++static int serial8250_default_handle_irq(struct uart_port *port); ++ ++static void set_io_from_upio(struct uart_port *p) ++{ ++ struct uart_8250_port *up = up_to_u8250p(p); ++ ++ up->dl_read = default_serial_dl_read; ++ up->dl_write = default_serial_dl_write; ++ ++ switch (p->iotype) { ++ case UPIO_HUB6: ++ p->serial_in = hub6_serial_in; ++ p->serial_out = hub6_serial_out; ++ break; ++ ++ case UPIO_MEM: ++ p->serial_in = mem_serial_in; ++ p->serial_out = mem_serial_out; ++ break; ++ ++ case UPIO_MEM16: ++ p->serial_in = mem16_serial_in; ++ p->serial_out = mem16_serial_out; ++ break; ++ ++ case UPIO_MEM32: ++ p->serial_in = mem32_serial_in; ++ p->serial_out = mem32_serial_out; ++ break; ++ ++ case UPIO_MEM32BE: ++ p->serial_in = mem32be_serial_in; ++ p->serial_out = mem32be_serial_out; ++ break; ++ ++#ifdef CONFIG_SERIAL_8250_RT288X ++ case UPIO_AU: ++ p->serial_in = au_serial_in; ++ p->serial_out = au_serial_out; ++ up->dl_read = au_serial_dl_read; ++ up->dl_write = au_serial_dl_write; ++ break; ++#endif ++ ++ default: ++ p->serial_in = io_serial_in; ++ p->serial_out = io_serial_out; ++ break; ++ } ++ /* Remember loaded iotype */ ++ up->cur_iotype = p->iotype; ++ p->handle_irq = serial8250_default_handle_irq; ++} ++ ++static void ++serial_port_out_sync(struct uart_port *p, int offset, int value) ++{ ++ switch (p->iotype) { ++ case UPIO_MEM: ++ case UPIO_MEM16: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_AU: ++ p->serial_out(p, offset, value); ++ p->serial_in(p, UART_LCR); /* safe, no side-effects */ ++ break; ++ default: ++ p->serial_out(p, offset, value); ++ } ++} ++ ++/* ++ * FIFO support. ++ */ ++static void serial8250_clear_fifos(struct uart_8250_port *p) ++{ ++ if (p->capabilities & UART_CAP_FIFO) { ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial_out(p, UART_FCR, 0); ++ } ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); ++ ++void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) ++{ ++ serial8250_clear_fifos(p); ++ serial_out(p, UART_FCR, p->fcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); ++ ++void serial8250_rpm_get(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get); ++ ++void serial8250_rpm_put(struct uart_8250_port *p) ++{ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put); ++ ++/** ++ * serial8250_em485_init() - put uart_8250_port into rs485 emulating ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to start rs485 software emulating on the ++ * &struct uart_8250_port* @p. Namely, RTS is switched before/after ++ * transmission. The function is idempotent, so it is safe to call it ++ * multiple times. ++ * ++ * The caller MUST enable interrupt on empty shift register before ++ * calling serial8250_em485_init(). This interrupt is not a part of ++ * 8250 standard, but implementation defined. 
++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_destroy() ++ * ++ * Return 0 - success, -errno - otherwise ++ */ ++static int serial8250_em485_init(struct uart_8250_port *p) ++{ ++ if (p->em485) ++ goto deassert_rts; ++ ++ p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); ++ if (!p->em485) ++ return -ENOMEM; ++ ++ hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; ++ p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; ++ p->em485->port = p; ++ p->em485->active_timer = NULL; ++ p->em485->tx_stopped = true; ++ ++deassert_rts: ++ if (p->em485->tx_stopped) ++ p->rs485_stop_tx(p); ++ ++ return 0; ++} ++ ++/** ++ * serial8250_em485_destroy() - put uart_8250_port into normal state ++ * @p: uart_8250_port port instance ++ * ++ * The function is used to stop rs485 software emulating on the ++ * &struct uart_8250_port* @p. The function is idempotent, so it is safe to ++ * call it multiple times. ++ * ++ * The function is supposed to be called from .rs485_config callback ++ * or from any other callback protected with p->port.lock spinlock. ++ * ++ * See also serial8250_em485_init() ++ */ ++void serial8250_em485_destroy(struct uart_8250_port *p) ++{ ++ if (!p->em485) ++ return; ++ ++ hrtimer_cancel(&p->em485->start_tx_timer); ++ hrtimer_cancel(&p->em485->stop_tx_timer); ++ ++ kfree(p->em485); ++ p->em485 = NULL; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_destroy); ++ ++struct serial_rs485 serial8250_em485_supported = { ++ .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | ++ SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, ++ .delay_rts_before_send = 1, ++ .delay_rts_after_send = 1, ++}; ++EXPORT_SYMBOL_GPL(serial8250_em485_supported); ++ ++/** ++ * serial8250_em485_config() - generic ->rs485_config() callback ++ * @port: uart port ++ * @rs485: rs485 settings ++ * ++ * Generic callback usable by 8250 uart drivers to activate rs485 settings ++ * if the uart is incapable of driving RTS as a Transmit Enable signal in ++ * hardware, relying on software emulation instead. ++ */ ++int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, ++ struct serial_rs485 *rs485) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* pick sane settings if the user hasn't */ ++ if (!!(rs485->flags & SER_RS485_RTS_ON_SEND) == ++ !!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { ++ rs485->flags |= SER_RS485_RTS_ON_SEND; ++ rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; ++ } ++ ++ /* ++ * Both serial8250_em485_init() and serial8250_em485_destroy() ++ * are idempotent. ++ */ ++ if (rs485->flags & SER_RS485_ENABLED) ++ return serial8250_em485_init(up); ++ ++ serial8250_em485_destroy(up); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_config); ++ ++/* ++ * These two wrappers ensure that enable_runtime_pm_tx() can be called more than ++ * once and disable_runtime_pm_tx() will still disable RPM because the fifo is ++ * empty and the HW can idle again. 
++ */ ++void serial8250_rpm_get_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 1); ++ if (rpm_active) ++ return; ++ pm_runtime_get_sync(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); ++ ++void serial8250_rpm_put_tx(struct uart_8250_port *p) ++{ ++ unsigned char rpm_active; ++ ++ if (!(p->capabilities & UART_CAP_RPM)) ++ return; ++ ++ rpm_active = xchg(&p->rpm_tx_active, 0); ++ if (!rpm_active) ++ return; ++ pm_runtime_mark_last_busy(p->port.dev); ++ pm_runtime_put_autosuspend(p->port.dev); ++} ++EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); ++ ++/* ++ * IER sleep support. UARTs which have EFRs need the "extended ++ * capability" bit enabled. Note that on XR16C850s, we need to ++ * reset LCR to write to IER. ++ */ ++static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) ++{ ++ unsigned char lcr = 0, efr = 0; ++ ++ serial8250_rpm_get(p); ++ ++ if (p->capabilities & UART_CAP_SLEEP) { ++ if (p->capabilities & UART_CAP_EFR) { ++ lcr = serial_in(p, UART_LCR); ++ efr = serial_in(p, UART_EFR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, UART_EFR_ECB); ++ serial_out(p, UART_LCR, 0); ++ } ++ serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ if (p->capabilities & UART_CAP_EFR) { ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(p, UART_EFR, efr); ++ serial_out(p, UART_LCR, lcr); ++ } ++ } ++ ++ serial8250_rpm_put(p); ++} ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++/* ++ * Attempts to turn on the RSA FIFO. Returns zero on failure. ++ * We set the port uart clock rate if we succeed. ++ */ ++static int __enable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = mode & UART_RSA_MSR_FIFO; ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; ++ ++ return result; ++} ++ ++static void enable_rsa(struct uart_8250_port *up) ++{ ++ if (up->port.type == PORT_RSA) { ++ if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ __enable_rsa(up); ++ spin_unlock_irq(&up->port.lock); ++ } ++ if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) ++ serial_out(up, UART_RSA_FRR, 0); ++ } ++} ++ ++/* ++ * Attempts to turn off the RSA FIFO. Returns zero on failure. ++ * It is unknown why interrupts were disabled in here. However, ++ * the caller is expected to preserve this behaviour by grabbing ++ * the spinlock before calling this function. ++ */ ++static void disable_rsa(struct uart_8250_port *up) ++{ ++ unsigned char mode; ++ int result; ++ ++ if (up->port.type == PORT_RSA && ++ up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { ++ spin_lock_irq(&up->port.lock); ++ ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ ++ if (!result) { ++ serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); ++ mode = serial_in(up, UART_RSA_MSR); ++ result = !(mode & UART_RSA_MSR_FIFO); ++ } ++ ++ if (result) ++ up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; ++ spin_unlock_irq(&up->port.lock); ++ } ++} ++#endif /* CONFIG_SERIAL_8250_RSA */ ++ ++/* ++ * This is a quickie test to see how big the FIFO is. ++ * It doesn't work at all the time, more's the pity. 
++ */ ++static int size_fifo(struct uart_8250_port *up) ++{ ++ unsigned char old_fcr, old_mcr, old_lcr; ++ unsigned short old_dl; ++ int count; ++ ++ old_lcr = serial_in(up, UART_LCR); ++ serial_out(up, UART_LCR, 0); ++ old_fcr = serial_in(up, UART_FCR); ++ old_mcr = serial8250_in_MCR(up); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); ++ serial8250_out_MCR(up, UART_MCR_LOOP); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(up); ++ serial_dl_write(up, 0x0001); ++ serial_out(up, UART_LCR, UART_LCR_WLEN8); ++ for (count = 0; count < 256; count++) ++ serial_out(up, UART_TX, count); ++ mdelay(20);/* FIXME - schedule_timeout */ ++ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && ++ (count < 256); count++) ++ serial_in(up, UART_RX); ++ serial_out(up, UART_FCR, old_fcr); ++ serial8250_out_MCR(up, old_mcr); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_dl_write(up, old_dl); ++ serial_out(up, UART_LCR, old_lcr); ++ ++ return count; ++} ++ ++/* ++ * Read UART ID using the divisor method - set DLL and DLM to zero ++ * and the revision will be in DLL and device type in DLM. We ++ * preserve the device state across this. ++ */ ++static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) ++{ ++ unsigned char old_lcr; ++ unsigned int id, old_dl; ++ ++ old_lcr = serial_in(p, UART_LCR); ++ serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); ++ old_dl = serial_dl_read(p); ++ serial_dl_write(p, 0); ++ id = serial_dl_read(p); ++ serial_dl_write(p, old_dl); ++ ++ serial_out(p, UART_LCR, old_lcr); ++ ++ return id; ++} ++ ++/* ++ * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. ++ * When this function is called we know it is at least a StarTech ++ * 16650 V2, but it might be one of several StarTech UARTs, or one of ++ * its clones. (We treat the broken original StarTech 16650 V1 as a ++ * 16550, and why not? Startech doesn't seem to even acknowledge its ++ * existence.) ++ * ++ * What evil have men's minds wrought... ++ */ ++static void autoconfig_has_efr(struct uart_8250_port *up) ++{ ++ unsigned int id1, id2, id3, rev; ++ ++ /* ++ * Everything with an EFR has SLEEP ++ */ ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ ++ /* ++ * First we check to see if it's an Oxford Semiconductor UART. ++ * ++ * If we have to do this here because some non-National ++ * Semiconductor clone chips lock up if you try writing to the ++ * LSR register (which serial_icr_read does) ++ */ ++ ++ /* ++ * Check for Oxford Semiconductor 16C950. ++ * ++ * EFR [4] must be set else this test fails. ++ * ++ * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) ++ * claims that it's needed for 952 dual UART's (which are not ++ * recommended for new designs). ++ */ ++ up->acr = 0; ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, UART_EFR_ECB); ++ serial_out(up, UART_LCR, 0x00); ++ id1 = serial_icr_read(up, UART_ID1); ++ id2 = serial_icr_read(up, UART_ID2); ++ id3 = serial_icr_read(up, UART_ID3); ++ rev = serial_icr_read(up, UART_REV); ++ ++ DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); ++ ++ if (id1 == 0x16 && id2 == 0xC9 && ++ (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { ++ up->port.type = PORT_16C950; ++ ++ /* ++ * Enable work around for the Oxford Semiconductor 952 rev B ++ * chip which causes it to seriously miscalculate baud rates ++ * when DLL is 0. 
++ */ ++ if (id3 == 0x52 && rev == 0x01) ++ up->bugs |= UART_BUG_QUOT; ++ return; ++ } ++ ++ /* ++ * We check for a XR16C850 by setting DLL and DLM to 0, and then ++ * reading back DLL and DLM. The chip type depends on the DLM ++ * value read back: ++ * 0x10 - XR16C850 and the DLL contains the chip revision. ++ * 0x12 - XR16C2850. ++ * 0x14 - XR16C854. ++ */ ++ id1 = autoconfig_read_divisor_id(up); ++ DEBUG_AUTOCONF("850id=%04x ", id1); ++ ++ id2 = id1 >> 8; ++ if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { ++ up->port.type = PORT_16850; ++ return; ++ } ++ ++ /* ++ * It wasn't an XR16C850. ++ * ++ * We distinguish between the '654 and the '650 by counting ++ * how many bytes are in the FIFO. I'm using this for now, ++ * since that's the technique that was sent to me in the ++ * serial driver update, but I'm not convinced this works. ++ * I've had problems doing this in the past. -TYT ++ */ ++ if (size_fifo(up) == 64) ++ up->port.type = PORT_16654; ++ else ++ up->port.type = PORT_16650V2; ++} ++ ++/* ++ * We detected a chip without a FIFO. Only two fall into ++ * this category - the original 8250 and the 16450. The ++ * 16450 has a scratch register (accessible with LCR=0) ++ */ ++static void autoconfig_8250(struct uart_8250_port *up) ++{ ++ unsigned char scratch, status1, status2; ++ ++ up->port.type = PORT_8250; ++ ++ scratch = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0xa5); ++ status1 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, 0x5a); ++ status2 = serial_in(up, UART_SCR); ++ serial_out(up, UART_SCR, scratch); ++ ++ if (status1 == 0xa5 && status2 == 0x5a) ++ up->port.type = PORT_16450; ++} ++ ++static int broken_efr(struct uart_8250_port *up) ++{ ++ /* ++ * Exar ST16C2550 "A2" devices incorrectly detect as ++ * having an EFR, and report an ID of 0x0201. See ++ * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html ++ */ ++ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * We know that the chip has FIFOs. Does it have an EFR? The ++ * EFR is located in the same register position as the IIR and ++ * we know the top two bits of the IIR are currently set. The ++ * EFR should contain zero. Try to read the EFR. ++ */ ++static void autoconfig_16550a(struct uart_8250_port *up) ++{ ++ unsigned char status1, status2; ++ unsigned int iersave; ++ ++ up->port.type = PORT_16550A; ++ up->capabilities |= UART_CAP_FIFO; ++ ++ if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && ++ !(up->port.flags & UPF_FULL_PROBE)) ++ return; ++ ++ /* ++ * Check for presence of the EFR when DLAB is set. ++ * Only ST16C650V1 UARTs pass this test. ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ if (serial_in(up, UART_EFR) == 0) { ++ serial_out(up, UART_EFR, 0xA8); ++ if (serial_in(up, UART_EFR) != 0) { ++ DEBUG_AUTOCONF("EFRv1 "); ++ up->port.type = PORT_16650; ++ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; ++ } else { ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | ++ UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ if (status1 == 7) ++ up->port.type = PORT_16550A_FSL64; ++ else ++ DEBUG_AUTOCONF("Motorola 8xxx DUART "); ++ } ++ serial_out(up, UART_EFR, 0); ++ return; ++ } ++ ++ /* ++ * Maybe it requires 0xbf to be written to the LCR. 
++ * (other ST16C650V2 UARTs, TI16C752A, etc) ++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { ++ DEBUG_AUTOCONF("EFRv2 "); ++ autoconfig_has_efr(up); ++ return; ++ } ++ ++ /* ++ * Check for a National Semiconductor SuperIO chip. ++ * Attempt to switch to bank 2, read the value of the LOOP bit ++ * from EXCR1. Switch back to bank 0, change it in MCR. Then ++ * switch back to bank 2, read it from EXCR1 again and check ++ * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 ++ */ ++ serial_out(up, UART_LCR, 0); ++ status1 = serial8250_in_MCR(up); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ ++ if (!((status2 ^ status1) & UART_MCR_LOOP)) { ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); ++ serial_out(up, UART_LCR, 0xE0); ++ status2 = serial_in(up, 0x02); /* EXCR1 */ ++ serial_out(up, UART_LCR, 0); ++ serial8250_out_MCR(up, status1); ++ ++ if ((status2 ^ status1) & UART_MCR_LOOP) { ++ unsigned short quot; ++ ++ serial_out(up, UART_LCR, 0xE0); ++ ++ quot = serial_dl_read(up); ++ quot <<= 3; ++ ++ if (ns16550a_goto_highspeed(up)) ++ serial_dl_write(up, quot); ++ ++ serial_out(up, UART_LCR, 0); ++ ++ up->port.uartclk = 921600*16; ++ up->port.type = PORT_NS16550A; ++ up->capabilities |= UART_NATSEMI; ++ return; ++ } ++ } ++ ++ /* ++ * No EFR. Try to detect a TI16750, which only sets bit 5 of ++ * the IIR when 64 byte FIFO mode is enabled when DLAB is set. ++ * Try setting it with and without DLAB set. Cheap clones ++ * set bit 5 without DLAB set. ++ */ ++ serial_out(up, UART_LCR, 0); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status1 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); ++ status2 = serial_in(up, UART_IIR) >> 5; ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_out(up, UART_LCR, 0); ++ ++ DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); ++ ++ if (status1 == 6 && status2 == 7) { ++ up->port.type = PORT_16750; ++ up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; ++ return; ++ } ++ ++ /* ++ * Try writing and reading the UART_IER_UUE bit (b6). ++ * If it works, this is probably one of the Xscale platform's ++ * internal UARTs. ++ * We're going to explicitly set the UUE bit to 0 before ++ * trying to write and read a 1 just to make sure it's not ++ * already a 1 and maybe locked there before we even start start. ++ */ ++ iersave = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, iersave & ~UART_IER_UUE); ++ if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { ++ /* ++ * OK it's in a known zero state, try writing and reading ++ * without disturbing the current state of the other bits. ++ */ ++ serial_out(up, UART_IER, iersave | UART_IER_UUE); ++ if (serial_in(up, UART_IER) & UART_IER_UUE) { ++ /* ++ * It's an Xscale. ++ * We'll leave the UART_IER_UUE bit set to 1 (enabled). ++ */ ++ DEBUG_AUTOCONF("Xscale "); ++ up->port.type = PORT_XSCALE; ++ up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; ++ return; ++ } ++ } else { ++ /* ++ * If we got here we couldn't force the IER_UUE bit to 0. ++ * Log it and continue. ++ */ ++ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); ++ } ++ serial_out(up, UART_IER, iersave); ++ ++ /* ++ * We distinguish between 16550A and U6 16550A by counting ++ * how many bytes are in the FIFO. 
++ */ ++ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { ++ up->port.type = PORT_U6_16550A; ++ up->capabilities |= UART_CAP_AFE; ++ } ++} ++ ++/* ++ * This routine is called by rs_init() to initialize a specific serial ++ * port. It determines what type of UART chip this serial port is ++ * using: 8250, 16450, 16550, 16550A. The important question is ++ * whether or not this UART is a 16550A or not, since this will ++ * determine whether or not we can use its FIFO features or not. ++ */ ++static void autoconfig(struct uart_8250_port *up) ++{ ++ unsigned char status1, scratch, scratch2, scratch3; ++ unsigned char save_lcr, save_mcr; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int old_capabilities; ++ ++ if (!port->iobase && !port->mapbase && !port->membase) ++ return; ++ ++ DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", ++ port->name, port->iobase, port->membase); ++ ++ /* ++ * We really do need global IRQs disabled here - we're going to ++ * be frobbing the chips IRQ enable register to see if it exists. ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->capabilities = 0; ++ up->bugs = 0; ++ ++ if (!(port->flags & UPF_BUGGY_UART)) { ++ /* ++ * Do a simple existence test first; if we fail this, ++ * there's no point trying anything else. ++ * ++ * 0x80 is used as a nonsense port to prevent against ++ * false positives due to ISA bus float. The ++ * assumption is that 0x80 is a non-existent port; ++ * which should be safe since include/asm/io.h also ++ * makes this assumption. ++ * ++ * Note: this is safe as long as MCR bit 4 is clear ++ * and the device is in "PC" mode. ++ */ ++ scratch = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, 0); ++#ifdef __i386__ ++ outb(0xff, 0x080); ++#endif ++ /* ++ * Mask out IER[7:4] bits for test as some UARTs (e.g. TL ++ * 16C754B) allow only to modify them if an EFR bit is set. ++ */ ++ scratch2 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, 0x0F); ++#ifdef __i386__ ++ outb(0, 0x080); ++#endif ++ scratch3 = serial_in(up, UART_IER) & 0x0f; ++ serial_out(up, UART_IER, scratch); ++ if (scratch2 != 0 || scratch3 != 0x0F) { ++ /* ++ * We failed; there's nothing here ++ */ ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", ++ scratch2, scratch3); ++ goto out; ++ } ++ } ++ ++ save_mcr = serial8250_in_MCR(up); ++ save_lcr = serial_in(up, UART_LCR); ++ ++ /* ++ * Check to see if a UART is really there. Certain broken ++ * internal modems based on the Rockwell chipset fail this ++ * test, because they apparently don't implement the loopback ++ * test mode. So this test is skipped on the COM 1 through ++ * COM 4 ports. This *should* be safe, since no board ++ * manufacturer would be stupid enough to design a board ++ * that conflicts with COM 1-4 --- we hope! ++ */ ++ if (!(port->flags & UPF_SKIP_TEST)) { ++ serial8250_out_MCR(up, UART_MCR_LOOP | 0x0A); ++ status1 = serial_in(up, UART_MSR) & 0xF0; ++ serial8250_out_MCR(up, save_mcr); ++ if (status1 != 0x90) { ++ spin_unlock_irqrestore(&port->lock, flags); ++ DEBUG_AUTOCONF("LOOP test failed (%02x) ", ++ status1); ++ goto out; ++ } ++ } ++ ++ /* ++ * We're pretty sure there's a port here. Lets find out what ++ * type of port it is. The IIR top two bits allows us to find ++ * out if it's 8250 or 16450, 16550, 16550A or later. This ++ * determines what we test for next. ++ * ++ * We also initialise the EFR (if any) to zero for later. The ++ * EFR occupies the same register location as the FCR and IIR. 
++ */ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_out(up, UART_EFR, 0); ++ serial_out(up, UART_LCR, 0); ++ ++ serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); ++ ++ /* Assign this as it is to truncate any bits above 7. */ ++ scratch = serial_in(up, UART_IIR); ++ ++ switch (scratch >> 6) { ++ case 0: ++ autoconfig_8250(up); ++ break; ++ case 1: ++ port->type = PORT_UNKNOWN; ++ break; ++ case 2: ++ port->type = PORT_16550; ++ break; ++ case 3: ++ autoconfig_16550a(up); ++ break; ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Only probe for RSA ports if we got the region. ++ */ ++ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && ++ __enable_rsa(up)) ++ port->type = PORT_RSA; ++#endif ++ ++ serial_out(up, UART_LCR, save_lcr); ++ ++ port->fifosize = uart_config[up->port.type].fifo_size; ++ old_capabilities = up->capabilities; ++ up->capabilities = uart_config[port->type].flags; ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ ++ if (port->type == PORT_UNKNOWN) ++ goto out_unlock; ++ ++ /* ++ * Reset the UART. ++ */ ++#ifdef CONFIG_SERIAL_8250_RSA ++ if (port->type == PORT_RSA) ++ serial_out(up, UART_RSA_FRR, 0); ++#endif ++ serial8250_out_MCR(up, save_mcr); ++ serial8250_clear_fifos(up); ++ serial_in(up, UART_RX); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_out(up, UART_IER, UART_IER_UUE); ++ else ++ serial_out(up, UART_IER, 0); ++ ++out_unlock: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Check if the device is a Fintek F81216A ++ */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) ++ fintek_8250_probe(up); ++ ++ if (up->capabilities != old_capabilities) { ++ dev_warn(port->dev, "detected caps %08x should be %08x\n", ++ old_capabilities, up->capabilities); ++ } ++out: ++ DEBUG_AUTOCONF("iir=%d ", scratch); ++ DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); ++} ++ ++static void autoconfig_irq(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char save_mcr, save_ier; ++ unsigned char save_ICP = 0; ++ unsigned int ICP = 0; ++ unsigned long irqs; ++ int irq; ++ ++ if (port->flags & UPF_FOURPORT) { ++ ICP = (port->iobase & 0xfe0) | 0x1f; ++ save_ICP = inb_p(ICP); ++ outb_p(0x80, ICP); ++ inb_p(ICP); ++ } ++ ++ if (uart_console(port)) ++ console_lock(); ++ ++ /* forget possible initially masked and pending IRQ */ ++ probe_irq_off(probe_irq_on()); ++ save_mcr = serial8250_in_MCR(up); ++ save_ier = serial_in(up, UART_IER); ++ serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); ++ ++ irqs = probe_irq_on(); ++ serial8250_out_MCR(up, 0); ++ udelay(10); ++ if (port->flags & UPF_FOURPORT) { ++ serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); ++ } else { ++ serial8250_out_MCR(up, ++ UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); ++ } ++ serial_out(up, UART_IER, 0x0f); /* enable all intrs */ ++ serial_in(up, UART_LSR); ++ serial_in(up, UART_RX); ++ serial_in(up, UART_IIR); ++ serial_in(up, UART_MSR); ++ serial_out(up, UART_TX, 0xFF); ++ udelay(20); ++ irq = probe_irq_off(irqs); ++ ++ serial8250_out_MCR(up, save_mcr); ++ serial_out(up, UART_IER, save_ier); ++ ++ if (port->flags & UPF_FOURPORT) ++ outb_p(save_ICP, ICP); ++ ++ if (uart_console(port)) ++ console_unlock(); ++ ++ port->irq = (irq > 0) ? 
irq : 0; ++} ++ ++static void serial8250_stop_rx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ ++ up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); ++ up->port.read_status_mask &= ~UART_LSR_DR; ++ serial_port_out(port, UART_IER, up->ier); ++ ++ serial8250_rpm_put(up); ++} ++ ++/** ++ * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback ++ * @p: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to stop rs485 transmission. ++ */ ++void serial8250_em485_stop_tx(struct uart_8250_port *p) ++{ ++ unsigned char mcr = serial8250_in_MCR(p); ++ ++ if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(p, mcr); ++ ++ /* ++ * Empty the RX FIFO, we are not interested in anything ++ * received during the half-duplex transmission. ++ * Enable previously disabled RX interrupts. ++ */ ++ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { ++ serial8250_clear_and_reinit_fifos(p); ++ ++ p->ier |= UART_IER_RLSI | UART_IER_RDI; ++ serial_port_out(&p->port, UART_IER, p->ier); ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); ++ ++static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ stop_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ serial8250_rpm_get(p); ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->stop_tx_timer) { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ serial8250_rpm_put(p); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) ++{ ++ hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); ++} ++ ++static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; ++ ++ /* ++ * rs485_stop_tx() is going to set RTS according to config ++ * AND flush RX FIFO if required. ++ */ ++ if (stop_delay > 0) { ++ em485->active_timer = &em485->stop_tx_timer; ++ hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); ++ } else { ++ p->rs485_stop_tx(p); ++ em485->active_timer = NULL; ++ em485->tx_stopped = true; ++ } ++} ++ ++static inline void __stop_tx(struct uart_8250_port *p) ++{ ++ struct uart_8250_em485 *em485 = p->em485; ++ ++ if (em485) { ++ u16 lsr = serial_lsr_in(p); ++ u64 stop_delay = 0; ++ ++ p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; ++ ++ if (!(lsr & UART_LSR_THRE)) ++ return; ++ /* ++ * To provide required timing and allow FIFO transfer, ++ * __stop_tx_rs485() must be called only when both FIFO and ++ * shift register are empty. The device driver should either ++ * enable interrupt on TEMT or set UART_CAP_NOTEMT that will ++ * enlarge stop_tx_timer by the tx time of one frame to cover ++ * for emptying of the shift register. ++ */ ++ if (!(lsr & UART_LSR_TEMT)) { ++ if (!(p->capabilities & UART_CAP_NOTEMT)) ++ return; ++ /* ++ * RTS might get deasserted too early with the normal ++ * frame timing formula. It seems to suggest THRE might ++ * get asserted already during tx of the stop bit ++ * rather than after it is fully sent. ++ * Roughly estimate 1 extra bit here with / 7. 
++ */ ++ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); ++ } ++ ++ __stop_tx_rs485(p, stop_delay); ++ } ++ ++ if (serial8250_clear_THRI(p)) ++ serial8250_rpm_put_tx(p); ++} ++ ++static void serial8250_stop_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ __stop_tx(up); ++ ++ /* ++ * We really want to stop the transmitter from sending. ++ */ ++ if (port->type == PORT_16C950) { ++ up->acr |= UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++ serial8250_rpm_put(up); ++} ++ ++static inline void __start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ if (up->dma && !up->dma->tx_dma(up)) ++ return; ++ ++ if (serial8250_set_THRI(up)) { ++ if (up->bugs & UART_BUG_TXEN) { ++ u16 lsr = serial_lsr_in(up); ++ ++ if (lsr & UART_LSR_THRE) ++ serial8250_tx_chars(up); ++ } ++ } ++ ++ /* ++ * Re-enable the transmitter if we disabled it. ++ */ ++ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { ++ up->acr &= ~UART_ACR_TXDIS; ++ serial_icr_write(up, UART_ACR, up->acr); ++ } ++} ++ ++/** ++ * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback ++ * @up: uart 8250 port ++ * ++ * Generic callback usable by 8250 uart drivers to start rs485 transmission. ++ * Assumes that setting the RTS bit in the MCR register means RTS is high. ++ * (Some chips use inverse semantics.) Further assumes that reception is ++ * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the ++ * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) ++ */ ++void serial8250_em485_start_tx(struct uart_8250_port *up) ++{ ++ unsigned char mcr = serial8250_in_MCR(up); ++ ++ if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) ++ serial8250_stop_rx(&up->port); ++ ++ if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) ++ mcr |= UART_MCR_RTS; ++ else ++ mcr &= ~UART_MCR_RTS; ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); ++ ++/* Returns false, if start_tx_timer was setup to defer TX start */ ++static bool start_tx_rs485(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ /* ++ * While serial8250_em485_handle_stop_tx() is a noop if ++ * em485->active_timer != &em485->stop_tx_timer, it might happen that ++ * the timer is still armed and triggers only after the current bunch of ++ * chars is send and em485->active_timer == &em485->stop_tx_timer again. ++ * So cancel the timer. There is still a theoretical race condition if ++ * the timer is already running and only comes around to check for ++ * em485->active_timer when &em485->stop_tx_timer is armed again. 
++ */ ++ if (em485->active_timer == &em485->stop_tx_timer) ++ hrtimer_try_to_cancel(&em485->stop_tx_timer); ++ ++ em485->active_timer = NULL; ++ ++ if (em485->tx_stopped) { ++ em485->tx_stopped = false; ++ ++ up->rs485_start_tx(up); ++ ++ if (up->port.rs485.delay_rts_before_send > 0) { ++ em485->active_timer = &em485->start_tx_timer; ++ start_hrtimer_ms(&em485->start_tx_timer, ++ up->port.rs485.delay_rts_before_send); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) ++{ ++ struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, ++ start_tx_timer); ++ struct uart_8250_port *p = em485->port; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&p->port.lock, flags); ++ if (em485->active_timer == &em485->start_tx_timer) { ++ __start_tx(&p->port); ++ em485->active_timer = NULL; ++ } ++ spin_unlock_irqrestore(&p->port.lock, flags); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void serial8250_start_tx(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct uart_8250_em485 *em485 = up->em485; ++ ++ if (!port->x_char && uart_circ_empty(&port->state->xmit)) ++ return; ++ ++ serial8250_rpm_get_tx(up); ++ ++ if (em485) { ++ if ((em485->active_timer == &em485->start_tx_timer) || ++ !start_tx_rs485(port)) ++ return; ++ } ++ __start_tx(port); ++} ++ ++static void serial8250_throttle(struct uart_port *port) ++{ ++ port->throttle(port); ++} ++ ++static void serial8250_unthrottle(struct uart_port *port) ++{ ++ port->unthrottle(port); ++} ++ ++static void serial8250_disable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_disable_ms(up->gpios); ++ ++ up->ier &= ~UART_IER_MSI; ++ serial_port_out(port, UART_IER, up->ier); ++} ++ ++static void serial8250_enable_ms(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* no MSR capabilities */ ++ if (up->bugs & UART_BUG_NOMSR) ++ return; ++ ++ mctrl_gpio_enable_ms(up->gpios); ++ ++ up->ier |= UART_IER_MSI; ++ ++ serial8250_rpm_get(up); ++ serial_port_out(port, UART_IER, up->ier); ++ serial8250_rpm_put(up); ++} ++ ++void serial8250_read_char(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ unsigned char ch; ++ char flag = TTY_NORMAL; ++ ++ if (likely(lsr & UART_LSR_DR)) ++ ch = serial_in(up, UART_RX); ++ else ++ /* ++ * Intel 82571 has a Serial Over Lan device that will ++ * set UART_LSR_BI without setting UART_LSR_DR when ++ * it receives a break. To avoid reading from the ++ * receive buffer without UART_LSR_DR bit set, we ++ * just force the read character to be 0 ++ */ ++ ch = 0; ++ ++ port->icount.rx++; ++ ++ lsr |= up->lsr_saved_flags; ++ up->lsr_saved_flags = 0; ++ ++ if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { ++ if (lsr & UART_LSR_BI) { ++ lsr &= ~(UART_LSR_FE | UART_LSR_PE); ++ port->icount.brk++; ++ /* ++ * We do the SysRQ and SAK checking ++ * here because otherwise the break ++ * may get masked by ignore_status_mask ++ * or read_status_mask. ++ */ ++ if (uart_handle_break(port)) ++ return; ++ } else if (lsr & UART_LSR_PE) ++ port->icount.parity++; ++ else if (lsr & UART_LSR_FE) ++ port->icount.frame++; ++ if (lsr & UART_LSR_OE) ++ port->icount.overrun++; ++ ++ /* ++ * Mask off conditions which should be ignored. 
++ */ ++ lsr &= port->read_status_mask; ++ ++ if (lsr & UART_LSR_BI) { ++ dev_dbg(port->dev, "handling break\n"); ++ flag = TTY_BREAK; ++ } else if (lsr & UART_LSR_PE) ++ flag = TTY_PARITY; ++ else if (lsr & UART_LSR_FE) ++ flag = TTY_FRAME; ++ } ++ if (uart_prepare_sysrq_char(port, ch)) ++ return; ++ ++ uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); ++} ++EXPORT_SYMBOL_GPL(serial8250_read_char); ++ ++/* ++ * serial8250_rx_chars - Read characters. The first LSR value must be passed in. ++ * ++ * Returns LSR bits. The caller should rely only on non-Rx related LSR bits ++ * (such as THRE) because the LSR value might come from an already consumed ++ * character. ++ */ ++u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) ++{ ++ struct uart_port *port = &up->port; ++ int max_count = 256; ++ ++ do { ++ serial8250_read_char(up, lsr); ++ if (--max_count == 0) ++ break; ++ lsr = serial_in(up, UART_LSR); ++ } while (lsr & (UART_LSR_DR | UART_LSR_BI)); ++ ++ tty_flip_buffer_push(&port->state->port); ++ return lsr; ++} ++EXPORT_SYMBOL_GPL(serial8250_rx_chars); ++ ++void serial8250_tx_chars(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct circ_buf *xmit = &port->state->xmit; ++ int count; ++ ++ if (port->x_char) { ++ uart_xchar_out(port, UART_TX); ++ return; ++ } ++ if (uart_tx_stopped(port)) { ++ serial8250_stop_tx(port); ++ return; ++ } ++ if (uart_circ_empty(xmit)) { ++ __stop_tx(up); ++ return; ++ } ++ ++ count = up->tx_loadsz; ++ do { ++ serial_out(up, UART_TX, xmit->buf[xmit->tail]); ++ if (up->bugs & UART_BUG_TXRACE) { ++ /* ++ * The Aspeed BMC virtual UARTs have a bug where data ++ * may get stuck in the BMC's Tx FIFO from bursts of ++ * writes on the APB interface. ++ * ++ * Delay back-to-back writes by a read cycle to avoid ++ * stalling the VUART. Read a register that won't have ++ * side-effects and discard the result. ++ */ ++ serial_in(up, UART_SCR); ++ } ++ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); ++ port->icount.tx++; ++ if (uart_circ_empty(xmit)) ++ break; ++ if ((up->capabilities & UART_CAP_HFIFO) && ++ !uart_lsr_tx_empty(serial_in(up, UART_LSR))) ++ break; ++ /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ ++ if ((up->capabilities & UART_CAP_MINI) && ++ !(serial_in(up, UART_LSR) & UART_LSR_THRE)) ++ break; ++ } while (--count > 0); ++ ++ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) ++ uart_write_wakeup(port); ++ ++ /* ++ * With RPM enabled, we have to wait until the FIFO is empty before the ++ * HW can go idle. 
So we get here once again with empty FIFO and disable ++ * the interrupt and RPM in __stop_tx() ++ */ ++ if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM)) ++ __stop_tx(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_tx_chars); ++ ++/* Caller holds uart port lock */ ++unsigned int serial8250_modem_status(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int status = serial_in(up, UART_MSR); ++ ++ status |= up->msr_saved_flags; ++ up->msr_saved_flags = 0; ++ if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && ++ port->state != NULL) { ++ if (status & UART_MSR_TERI) ++ port->icount.rng++; ++ if (status & UART_MSR_DDSR) ++ port->icount.dsr++; ++ if (status & UART_MSR_DDCD) ++ uart_handle_dcd_change(port, status & UART_MSR_DCD); ++ if (status & UART_MSR_DCTS) ++ uart_handle_cts_change(port, status & UART_MSR_CTS); ++ ++ wake_up_interruptible(&port->state->port.delta_msr_wait); ++ } ++ ++ return status; ++} ++EXPORT_SYMBOL_GPL(serial8250_modem_status); ++ ++static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) ++{ ++ switch (iir & 0x3f) { ++ case UART_IIR_RDI: ++ if (!up->dma->rx_running) ++ break; ++ fallthrough; ++ case UART_IIR_RLSI: ++ case UART_IIR_RX_TIMEOUT: ++ serial8250_rx_dma_flush(up); ++ return true; ++ } ++ return up->dma->rx_dma(up); ++} ++ ++/* ++ * This handles the interrupt from one port. ++ */ ++int serial8250_handle_irq(struct uart_port *port, unsigned int iir) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ bool skip_rx = false; ++ unsigned long flags; ++ u16 status; ++ ++ if (iir & UART_IIR_NO_INT) ++ return 0; ++ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ status = serial_lsr_in(up); ++ ++ /* ++ * If port is stopped and there are no error conditions in the ++ * FIFO, then don't drain the FIFO, as this may lead to TTY buffer ++ * overflow. Not servicing, RX FIFO would trigger auto HW flow ++ * control when FIFO occupancy reaches preset threshold, thus ++ * halting RX. This only works when auto HW flow control is ++ * available. ++ */ ++ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && ++ (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && ++ !(port->read_status_mask & UART_LSR_DR)) ++ skip_rx = true; ++ ++ if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { ++ if (!up->dma || handle_rx_dma(up, iir)) ++ status = serial8250_rx_chars(up, status); ++ } ++ serial8250_modem_status(up); ++ if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { ++ if (!up->dma || up->dma->tx_err) ++ serial8250_tx_chars(up); ++ else if (!up->dma->tx_running) ++ __stop_tx(up); ++ } ++ ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ ++ return 1; ++} ++EXPORT_SYMBOL_GPL(serial8250_handle_irq); ++ ++static int serial8250_default_handle_irq(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int iir; ++ int ret; ++ ++ serial8250_rpm_get(up); ++ ++ iir = serial_port_in(port, UART_IIR); ++ ret = serial8250_handle_irq(port, iir); ++ ++ serial8250_rpm_put(up); ++ return ret; ++} ++ ++/* ++ * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP ++ * have a programmable TX threshold that triggers the THRE interrupt in ++ * the IIR register. In this case, the THRE interrupt indicates the FIFO ++ * has space available. Load it up with tx_loadsz bytes. 
++ */ ++static int serial8250_tx_threshold_handle_irq(struct uart_port *port) ++{ ++ unsigned long flags; ++ unsigned int iir = serial_port_in(port, UART_IIR); ++ ++ /* TX Threshold IRQ triggered so load up FIFO */ ++ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ serial8250_tx_chars(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ } ++ ++ iir = serial_port_in(port, UART_IIR); ++ return serial8250_handle_irq(port, iir); ++} ++ ++static unsigned int serial8250_tx_empty(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ lsr = serial_lsr_in(up); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ serial8250_rpm_put(up); ++ ++ return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0; ++} ++ ++unsigned int serial8250_do_get_mctrl(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int status; ++ unsigned int val; ++ ++ serial8250_rpm_get(up); ++ status = serial8250_modem_status(up); ++ serial8250_rpm_put(up); ++ ++ val = serial8250_MSR_to_TIOCM(status); ++ if (up->gpios) ++ return mctrl_gpio_get(up->gpios, &val); ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); ++ ++static unsigned int serial8250_get_mctrl(struct uart_port *port) ++{ ++ if (port->get_mctrl) ++ return port->get_mctrl(port); ++ return serial8250_do_get_mctrl(port); ++} ++ ++void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char mcr; ++ ++ mcr = serial8250_TIOCM_to_MCR(mctrl); ++ ++ mcr |= up->mcr; ++ ++ serial8250_out_MCR(up, mcr); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); ++ ++static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) ++{ ++ if (port->rs485.flags & SER_RS485_ENABLED) ++ return; ++ ++ if (port->set_mctrl) ++ port->set_mctrl(port, mctrl); ++ else ++ serial8250_do_set_mctrl(port, mctrl); ++} ++ ++static void serial8250_break_ctl(struct uart_port *port, int break_state) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ if (break_state == -1) ++ up->lcr |= UART_LCR_SBC; ++ else ++ up->lcr &= ~UART_LCR_SBC; ++ serial_port_out(port, UART_LCR, up->lcr); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++} ++ ++static void wait_for_lsr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int status, tmout = 10000; ++ ++ /* Wait up to 10ms for the character(s) to be sent. 
*/ ++ for (;;) { ++ status = serial_lsr_in(up); ++ ++ if ((status & bits) == bits) ++ break; ++ if (--tmout == 0) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++} ++ ++/* ++ * Wait for transmitter & holding register to empty ++ */ ++static void wait_for_xmitr(struct uart_8250_port *up, int bits) ++{ ++ unsigned int tmout; ++ ++ wait_for_lsr(up, bits); ++ ++ /* Wait up to 1s for flow control if necessary */ ++ if (up->port.flags & UPF_CONS_FLOW) { ++ for (tmout = 1000000; tmout; tmout--) { ++ unsigned int msr = serial_in(up, UART_MSR); ++ up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; ++ if (msr & UART_MSR_CTS) ++ break; ++ udelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++#ifdef CONFIG_CONSOLE_POLL ++/* ++ * Console polling routines for writing and reading from the uart while ++ * in an interrupt or debug context. ++ */ ++ ++static int serial8250_get_poll_char(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int status; ++ u16 lsr; ++ ++ serial8250_rpm_get(up); ++ ++ lsr = serial_port_in(port, UART_LSR); ++ ++ if (!(lsr & UART_LSR_DR)) { ++ status = NO_POLL_CHAR; ++ goto out; ++ } ++ ++ status = serial_port_in(port, UART_RX); ++out: ++ serial8250_rpm_put(up); ++ return status; ++} ++ ++ ++static void serial8250_put_poll_char(struct uart_port *port, ++ unsigned char c) ++{ ++ unsigned int ier; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_rpm_get(up); ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ /* ++ * Send the character out. ++ */ ++ serial_port_out(port, UART_TX, c); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ serial_port_out(port, UART_IER, ier); ++ serial8250_rpm_put(up); ++} ++ ++#endif /* CONFIG_CONSOLE_POLL */ ++ ++int serial8250_do_startup(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ unsigned char iir; ++ int retval; ++ u16 lsr; ++ ++ if (!port->fifosize) ++ port->fifosize = uart_config[port->type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[port->type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[port->type].flags; ++ up->mcr = 0; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ serial8250_rpm_get(up); ++ if (port->type == PORT_16C950) { ++ /* Wake up and initialize UART */ ++ up->acr = 0; ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_LCR, 0); ++ serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ serial_port_out(port, UART_EFR, UART_EFR_ECB); ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ if (port->type == PORT_DA830) { ++ /* Reset the port */ ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); ++ mdelay(10); ++ ++ /* Enable Tx, Rx and free run mode */ ++ serial_port_out(port, UART_DA830_PWREMU_MGMT, ++ UART_DA830_PWREMU_MGMT_UTRST | ++ UART_DA830_PWREMU_MGMT_URRST | ++ UART_DA830_PWREMU_MGMT_FREE); ++ } ++ ++ if (port->type == PORT_NPCM) { ++ /* ++ * Nuvoton calls the scratch register 'UART_TOR' (timeout ++ * 
register). Enable it, and set TIOC (timeout interrupt ++ * comparator) to be 0x20 for correct operation. ++ */ ++ serial_port_out(port, UART_NPCM_TOR, UART_NPCM_TOIE | 0x20); ++ } ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * If this is an RSA port, see if we can kick it up to the ++ * higher speed clock. ++ */ ++ enable_rsa(up); ++#endif ++ ++ /* ++ * Clear the FIFO buffers and disable them. ++ * (they will be reenabled in set_termios()) ++ */ ++ serial8250_clear_fifos(up); ++ ++ /* ++ * Clear the interrupt registers. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ ++ /* ++ * At this point, there's no way the LSR could still be 0xff; ++ * if it is, then bail out, because there's likely no UART ++ * here. ++ */ ++ if (!(port->flags & UPF_BUGGY_UART) && ++ (serial_port_in(port, UART_LSR) == 0xff)) { ++ dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); ++ retval = -ENODEV; ++ goto out; ++ } ++ ++ /* ++ * For a XR16C850, we need to set the trigger levels ++ */ ++ if (port->type == PORT_16850) { ++ unsigned char fctr; ++ ++ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); ++ ++ fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_RX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ serial_port_out(port, UART_FCTR, ++ fctr | UART_FCTR_TRGD | UART_FCTR_TX); ++ serial_port_out(port, UART_TRG, UART_TRG_96); ++ ++ serial_port_out(port, UART_LCR, 0); ++ } ++ ++ /* ++ * For the Altera 16550 variants, set TX threshold trigger level. ++ */ ++ if (((port->type == PORT_ALTR_16550_F32) || ++ (port->type == PORT_ALTR_16550_F64) || ++ (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { ++ /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ ++ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { ++ dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); ++ } else { ++ serial_port_out(port, UART_ALTR_AFR, ++ UART_ALTR_EN_TXFIFO_LW); ++ serial_port_out(port, UART_ALTR_TX_LOW, ++ port->fifosize - up->tx_loadsz); ++ port->handle_irq = serial8250_tx_threshold_handle_irq; ++ } ++ } ++ ++ /* Check if we need to have shared IRQs */ ++ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) ++ up->port.irqflags |= IRQF_SHARED; ++ ++ retval = up->ops->setup_irq(up); ++ if (retval) ++ goto out; ++ ++ if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { ++ unsigned char iir1; ++ ++ if (port->irqflags & IRQF_SHARED) ++ disable_irq_nosync(port->irq); ++ ++ /* ++ * Test for UARTs that do not reassert THRE when the ++ * transmitter is idle and the interrupt has already ++ * been cleared. Real 16550s should always reassert ++ * this interrupt whenever the transmitter is idle and ++ * the interrupt is enabled. Delays are necessary to ++ * allow register changes to become visible. 
++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow THRE to set */ ++ iir1 = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ serial_port_out_sync(port, UART_IER, UART_IER_THRI); ++ udelay(1); /* allow a working UART time to re-assert THRE */ ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ if (port->irqflags & IRQF_SHARED) ++ enable_irq(port->irq); ++ ++ /* ++ * If the interrupt is not reasserted, or we otherwise ++ * don't trust the iir, setup a timer to kick the UART ++ * on a regular basis. ++ */ ++ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || ++ up->port.flags & UPF_BUG_THRE) { ++ up->bugs |= UART_BUG_THRE; ++ } ++ } ++ ++ up->ops->setup_timer(up); ++ ++ /* ++ * Now, initialize the UART ++ */ ++ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (up->port.flags & UPF_FOURPORT) { ++ if (!up->port.irq) ++ up->port.mctrl |= TIOCM_OUT1; ++ } else ++ /* ++ * Most PC uarts need OUT2 raised to enable interrupts. ++ */ ++ if (port->irq) ++ up->port.mctrl |= TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ ++ /* ++ * Serial over Lan (SoL) hack: ++ * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be ++ * used for Serial Over Lan. Those chips take a longer time than a ++ * normal serial device to signalize that a transmission data was ++ * queued. Due to that, the above test generally fails. One solution ++ * would be to delay the reading of iir. However, this is not ++ * reliable, since the timeout is variable. So, let's just don't ++ * test if we receive TX irq. This way, we'll never enable ++ * UART_BUG_TXEN. ++ */ ++ if (up->port.quirks & UPQ_NO_TXEN_TEST) ++ goto dont_test_tx_en; ++ ++ /* ++ * Do a quick test to see if we receive an interrupt when we enable ++ * the TX irq. ++ */ ++ serial_port_out(port, UART_IER, UART_IER_THRI); ++ lsr = serial_port_in(port, UART_LSR); ++ iir = serial_port_in(port, UART_IIR); ++ serial_port_out(port, UART_IER, 0); ++ ++ if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { ++ if (!(up->bugs & UART_BUG_TXEN)) { ++ up->bugs |= UART_BUG_TXEN; ++ dev_dbg(port->dev, "enabling bad tx status workarounds\n"); ++ } ++ } else { ++ up->bugs &= ~UART_BUG_TXEN; ++ } ++ ++dont_test_tx_en: ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Clear the interrupt registers again for luck, and clear the ++ * saved flags to avoid getting false values from polling ++ * routines or the previous session. ++ */ ++ serial_port_in(port, UART_LSR); ++ serial_port_in(port, UART_RX); ++ serial_port_in(port, UART_IIR); ++ serial_port_in(port, UART_MSR); ++ up->lsr_saved_flags = 0; ++ up->msr_saved_flags = 0; ++ ++ /* ++ * Request DMA channels for both RX and TX. ++ */ ++ if (up->dma) { ++ const char *msg = NULL; ++ ++ if (uart_console(port)) ++ msg = "forbid DMA for kernel console"; ++ else if (serial8250_request_dma(up)) ++ msg = "failed to request DMA"; ++ if (msg) { ++ dev_warn_ratelimited(port->dev, "%s\n", msg); ++ up->dma = NULL; ++ } ++ } ++ ++ /* ++ * Set the IER shadow for rx interrupts but defer actual interrupt ++ * enable until after the FIFOs are enabled; otherwise, an already- ++ * active sender can swamp the interrupt handler with "too much work". 
++ */ ++ up->ier = UART_IER_RLSI | UART_IER_RDI; ++ ++ if (port->flags & UPF_FOURPORT) { ++ unsigned int icp; ++ /* ++ * Enable interrupts on the AST Fourport board ++ */ ++ icp = (port->iobase & 0xfe0) | 0x01f; ++ outb_p(0x80, icp); ++ inb_p(icp); ++ } ++ retval = 0; ++out: ++ serial8250_rpm_put(up); ++ return retval; ++} ++EXPORT_SYMBOL_GPL(serial8250_do_startup); ++ ++static int serial8250_startup(struct uart_port *port) ++{ ++ if (port->startup) ++ return port->startup(port); ++ return serial8250_do_startup(port); ++} ++ ++void serial8250_do_shutdown(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ serial8250_rpm_get(up); ++ /* ++ * Disable interrupts from this port ++ */ ++ spin_lock_irqsave(&port->lock, flags); ++ up->ier = 0; ++ serial_port_out(port, UART_IER, 0); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ synchronize_irq(port->irq); ++ ++ if (up->dma) ++ serial8250_release_dma(up); ++ ++ spin_lock_irqsave(&port->lock, flags); ++ if (port->flags & UPF_FOURPORT) { ++ /* reset interrupts on the AST Fourport board */ ++ inb((port->iobase & 0xfe0) | 0x1f); ++ port->mctrl |= TIOCM_OUT1; ++ } else ++ port->mctrl &= ~TIOCM_OUT2; ++ ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ ++ /* ++ * Disable break condition and FIFOs ++ */ ++ serial_port_out(port, UART_LCR, ++ serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); ++ serial8250_clear_fifos(up); ++ ++#ifdef CONFIG_SERIAL_8250_RSA ++ /* ++ * Reset the RSA board back to 115kbps compat mode. ++ */ ++ disable_rsa(up); ++#endif ++ ++ /* ++ * Read data port to reset things, and then unlink from ++ * the IRQ chain. ++ */ ++ serial_port_in(port, UART_RX); ++ serial8250_rpm_put(up); ++ ++ up->ops->release_irq(up); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_shutdown); ++ ++static void serial8250_shutdown(struct uart_port *port) ++{ ++ if (port->shutdown) ++ port->shutdown(port); ++ else ++ serial8250_do_shutdown(port); ++} ++ ++/* Nuvoton NPCM UARTs have a custom divisor calculation */ ++static unsigned int npcm_get_divisor(struct uart_8250_port *up, ++ unsigned int baud) ++{ ++ struct uart_port *port = &up->port; ++ ++ return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2; ++} ++ ++static unsigned int serial8250_do_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int quot; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. We clamp custom rates from clk/6 and clk/12 ++ * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These ++ * magic divisors actually reprogram the baud rate generator's ++ * reference clock derived from chips's 14.318MHz clock input. ++ * ++ * Documentation claims that with these magic divisors the base ++ * frequencies of 7.3728MHz and 3.6864MHz are used respectively ++ * for the extra baud rates of 460800bps and 230400bps rather ++ * than the usual base frequency of 1.8462MHz. However empirical ++ * evidence contradicts that. ++ * ++ * Instead bit 7 of the DLM register (bit 15 of the divisor) is ++ * effectively used as a clock prescaler selection bit for the ++ * base frequency of 7.3728MHz, always used. If set to 0, then ++ * the base frequency is divided by 4 for use by the Baud Rate ++ * Generator, for the usual arrangement where the value of 1 of ++ * the divisor produces the baud rate of 115200bps. 
Conversely, ++ * if set to 1 and high-speed operation has been enabled with the ++ * Serial Port Mode Register in the Device Configuration Space, ++ * then the base frequency is supplied directly to the Baud Rate ++ * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, ++ * 0x8004, etc. the respective baud rates produced are 460800bps, ++ * 230400bps, 153600bps, 115200bps, etc. ++ * ++ * In all cases only low 15 bits of the divisor are used to divide ++ * the baud base and therefore 32767 is the maximum divisor value ++ * possible, even though documentation says that the programmable ++ * Baud Rate Generator is capable of dividing the internal PLL ++ * clock by any divisor from 1 to 65535. ++ */ ++ if (magic_multiplier && baud >= port->uartclk / 6) ++ quot = 0x8001; ++ else if (magic_multiplier && baud >= port->uartclk / 12) ++ quot = 0x8002; ++ else if (up->port.type == PORT_NPCM) ++ quot = npcm_get_divisor(up, baud); ++ else ++ quot = uart_get_divisor(port, baud); ++ ++ /* ++ * Oxford Semi 952 rev B workaround ++ */ ++ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) ++ quot++; ++ ++ return quot; ++} ++ ++static unsigned int serial8250_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ if (port->get_divisor) ++ return port->get_divisor(port, baud, frac); ++ ++ return serial8250_do_get_divisor(port, baud, frac); ++} ++ ++static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, ++ tcflag_t c_cflag) ++{ ++ unsigned char cval; ++ ++ cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); ++ ++ if (c_cflag & CSTOPB) ++ cval |= UART_LCR_STOP; ++ if (c_cflag & PARENB) { ++ cval |= UART_LCR_PARITY; ++ if (up->bugs & UART_BUG_PARITY) ++ up->fifo_bug = true; ++ } ++ if (!(c_cflag & PARODD)) ++ cval |= UART_LCR_EPAR; ++ if (c_cflag & CMSPAR) ++ cval |= UART_LCR_SPAR; ++ ++ return cval; ++} ++ ++void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */ ++ if (is_omap1510_8250(up)) { ++ if (baud == 115200) { ++ quot = 1; ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); ++ } else ++ serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); ++ } ++ ++ /* ++ * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, ++ * otherwise just set DLAB ++ */ ++ if (up->capabilities & UART_NATSEMI) ++ serial_port_out(port, UART_LCR, 0xe0); ++ else ++ serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); ++ ++ serial_dl_write(up, quot); ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); ++ ++static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, ++ unsigned int quot, unsigned int quot_frac) ++{ ++ if (port->set_divisor) ++ port->set_divisor(port, baud, quot, quot_frac); ++ else ++ serial8250_do_set_divisor(port, baud, quot, quot_frac); ++} ++ ++static unsigned int serial8250_get_baud_rate(struct uart_port *port, ++ struct ktermios *termios, ++ struct ktermios *old) ++{ ++ unsigned int tolerance = port->uartclk / 100; ++ unsigned int min; ++ unsigned int max; ++ ++ /* ++ * Handle magic divisors for baud rates above baud_base on SMSC ++ * Super I/O chips. Enable custom rates of clk/4 and clk/8, but ++ * disable divisor values beyond 32767, which are unavailable. 
++ */ ++ if (port->flags & UPF_MAGIC_MULTIPLIER) { ++ min = port->uartclk / 16 / UART_DIV_MAX >> 1; ++ max = (port->uartclk + tolerance) / 4; ++ } else { ++ min = port->uartclk / 16 / UART_DIV_MAX; ++ max = (port->uartclk + tolerance) / 16; ++ } ++ ++ /* ++ * Ask the core to calculate the divisor for us. ++ * Allow 1% tolerance at the upper limit so uart clks marginally ++ * slower than nominal still match standard baud rates without ++ * causing transmission errors. ++ */ ++ return uart_get_baud_rate(port, termios, old, min, max); ++} ++ ++/* ++ * Note in order to avoid the tty port mutex deadlock don't use the next method ++ * within the uart port callbacks. Primarily it's supposed to be utilized to ++ * handle a sudden reference clock rate change. ++ */ ++void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ struct tty_port *tport = &port->state->port; ++ unsigned int baud, quot, frac = 0; ++ struct ktermios *termios; ++ struct tty_struct *tty; ++ unsigned long flags; ++ ++ tty = tty_port_tty_get(tport); ++ if (!tty) { ++ mutex_lock(&tport->mutex); ++ port->uartclk = uartclk; ++ mutex_unlock(&tport->mutex); ++ return; ++ } ++ ++ down_write(&tty->termios_rwsem); ++ mutex_lock(&tport->mutex); ++ ++ if (port->uartclk == uartclk) ++ goto out_unlock; ++ ++ port->uartclk = uartclk; ++ ++ if (!tty_port_initialized(tport)) ++ goto out_unlock; ++ ++ termios = &tty->termios; ++ ++ baud = serial8250_get_baud_rate(port, termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++out_unlock: ++ mutex_unlock(&tport->mutex); ++ up_write(&tty->termios_rwsem); ++ tty_kref_put(tty); ++} ++EXPORT_SYMBOL_GPL(serial8250_update_uartclk); ++ ++void ++serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned char cval; ++ unsigned long flags; ++ unsigned int baud, quot, frac = 0; ++ ++ if (up->capabilities & UART_CAP_MINI) { ++ termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); ++ if ((termios->c_cflag & CSIZE) == CS5 || ++ (termios->c_cflag & CSIZE) == CS6) ++ termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; ++ } ++ cval = serial8250_compute_lcr(up, termios->c_cflag); ++ ++ baud = serial8250_get_baud_rate(port, termios, old); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ /* ++ * Ok, we're now changing the port state. Do it with ++ * interrupts disabled. ++ */ ++ serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); ++ ++ up->lcr = cval; /* Save computed LCR */ ++ ++ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { ++ /* NOTE: If fifo_bug is not set, a user can set RX_trigger. */ ++ if ((baud < 2400 && !up->dma) || up->fifo_bug) { ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= UART_FCR_TRIGGER_1; ++ } ++ } ++ ++ /* ++ * MCR-based auto flow control. When AFE is enabled, RTS will be ++ * deasserted when the receive FIFO contains more characters than ++ * the trigger, or the MCR RTS bit is cleared. 
++ */ ++ if (up->capabilities & UART_CAP_AFE) { ++ up->mcr &= ~UART_MCR_AFE; ++ if (termios->c_cflag & CRTSCTS) ++ up->mcr |= UART_MCR_AFE; ++ } ++ ++ /* ++ * Update the per-port timeout. ++ */ ++ uart_update_timeout(port, termios->c_cflag, baud); ++ ++ port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; ++ if (termios->c_iflag & INPCK) ++ port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; ++ if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) ++ port->read_status_mask |= UART_LSR_BI; ++ ++ /* ++ * Characters to ignore ++ */ ++ port->ignore_status_mask = 0; ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; ++ if (termios->c_iflag & IGNBRK) { ++ port->ignore_status_mask |= UART_LSR_BI; ++ /* ++ * If we're ignoring parity and break indicators, ++ * ignore overruns too (for real raw support). ++ */ ++ if (termios->c_iflag & IGNPAR) ++ port->ignore_status_mask |= UART_LSR_OE; ++ } ++ ++ /* ++ * ignore all characters if CREAD is not set ++ */ ++ if ((termios->c_cflag & CREAD) == 0) ++ port->ignore_status_mask |= UART_LSR_DR; ++ ++ /* ++ * CTS flow control flag and modem status interrupts ++ */ ++ up->ier &= ~UART_IER_MSI; ++ if (!(up->bugs & UART_BUG_NOMSR) && ++ UART_ENABLE_MS(&up->port, termios->c_cflag)) ++ up->ier |= UART_IER_MSI; ++ if (up->capabilities & UART_CAP_UUE) ++ up->ier |= UART_IER_UUE; ++ if (up->capabilities & UART_CAP_RTOIE) ++ up->ier |= UART_IER_RTOIE; ++ ++ serial_port_out(port, UART_IER, up->ier); ++ ++ if (up->capabilities & UART_CAP_EFR) { ++ unsigned char efr = 0; ++ /* ++ * TI16C752/Startech hardware flow control. FIXME: ++ * - TI16C752 requires control thresholds to be set. ++ * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. ++ */ ++ if (termios->c_cflag & CRTSCTS) ++ efr |= UART_EFR_CTS; ++ ++ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); ++ if (port->flags & UPF_EXAR_EFR) ++ serial_port_out(port, UART_XR_EFR, efr); ++ else ++ serial_port_out(port, UART_EFR, efr); ++ } ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ ++ /* ++ * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR ++ * is written without DLAB set, this mode will be disabled. 
++ */ ++ if (port->type == PORT_16750) ++ serial_port_out(port, UART_FCR, up->fcr); ++ ++ serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ ++ if (port->type != PORT_16750) { ++ /* emulated UARTs (Lucent Venus 167x) need two steps */ ++ if (up->fcr & UART_FCR_ENABLE_FIFO) ++ serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); ++ serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ ++ } ++ serial8250_set_mctrl(port, port->mctrl); ++ spin_unlock_irqrestore(&port->lock, flags); ++ serial8250_rpm_put(up); ++ ++ /* Don't rewrite B0 */ ++ if (tty_termios_baud_rate(termios)) ++ tty_termios_encode_baud_rate(termios, baud, baud); ++} ++EXPORT_SYMBOL(serial8250_do_set_termios); ++ ++static void ++serial8250_set_termios(struct uart_port *port, struct ktermios *termios, ++ struct ktermios *old) ++{ ++ if (port->set_termios) ++ port->set_termios(port, termios, old); ++ else ++ serial8250_do_set_termios(port, termios, old); ++} ++ ++void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (termios->c_line == N_PPS) { ++ port->flags |= UPF_HARDPPS_CD; ++ spin_lock_irq(&port->lock); ++ serial8250_enable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } else { ++ port->flags &= ~UPF_HARDPPS_CD; ++ if (!UART_ENABLE_MS(port, termios->c_cflag)) { ++ spin_lock_irq(&port->lock); ++ serial8250_disable_ms(port); ++ spin_unlock_irq(&port->lock); ++ } ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); ++ ++static void ++serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) ++{ ++ if (port->set_ldisc) ++ port->set_ldisc(port, termios); ++ else ++ serial8250_do_set_ldisc(port, termios); ++} ++ ++void serial8250_do_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ struct uart_8250_port *p = up_to_u8250p(port); ++ ++ serial8250_set_sleep(p, state != 0); ++} ++EXPORT_SYMBOL(serial8250_do_pm); ++ ++static void ++serial8250_pm(struct uart_port *port, unsigned int state, ++ unsigned int oldstate) ++{ ++ if (port->pm) ++ port->pm(port, state, oldstate); ++ else ++ serial8250_do_pm(port, state, oldstate); ++} ++ ++static unsigned int serial8250_port_size(struct uart_8250_port *pt) ++{ ++ if (pt->port.mapsize) ++ return pt->port.mapsize; ++ if (pt->port.iotype == UPIO_AU) { ++ if (pt->port.type == PORT_RT2880) ++ return 0x100; ++ return 0x1000; ++ } ++ if (is_omap1_8250(pt)) ++ return 0x16 << pt->port.regshift; ++ ++ return 8 << pt->port.regshift; ++} ++ ++/* ++ * Resource handling. 
++ */ ++static int serial8250_request_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ int ret = 0; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (!request_mem_region(port->mapbase, size, "serial")) { ++ ret = -EBUSY; ++ break; ++ } ++ ++ if (port->flags & UPF_IOREMAP) { ++ port->membase = ioremap(port->mapbase, size); ++ if (!port->membase) { ++ release_mem_region(port->mapbase, size); ++ ret = -ENOMEM; ++ } ++ } ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ if (!request_region(port->iobase, size, "serial")) ++ ret = -EBUSY; ++ break; ++ } ++ return ret; ++} ++ ++static void serial8250_release_std_resource(struct uart_8250_port *up) ++{ ++ unsigned int size = serial8250_port_size(up); ++ struct uart_port *port = &up->port; ++ ++ switch (port->iotype) { ++ case UPIO_AU: ++ case UPIO_TSI: ++ case UPIO_MEM32: ++ case UPIO_MEM32BE: ++ case UPIO_MEM16: ++ case UPIO_MEM: ++ if (!port->mapbase) ++ break; ++ ++ if (port->flags & UPF_IOREMAP) { ++ iounmap(port->membase); ++ port->membase = NULL; ++ } ++ ++ release_mem_region(port->mapbase, size); ++ break; ++ ++ case UPIO_HUB6: ++ case UPIO_PORT: ++ release_region(port->iobase, size); ++ break; ++ } ++} ++ ++static void serial8250_release_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ serial8250_release_std_resource(up); ++} ++ ++static int serial8250_request_port(struct uart_port *port) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ return serial8250_request_std_resource(up); ++} ++ ++static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ unsigned char bytes; ++ ++ bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; ++ ++ return bytes ? 
bytes : -EOPNOTSUPP; ++} ++ ++static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ int i; ++ ++ if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) ++ return -EOPNOTSUPP; ++ ++ for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { ++ if (bytes < conf_type->rxtrig_bytes[i]) ++ /* Use the nearest lower value */ ++ return (--i) << UART_FCR_R_TRIG_SHIFT; ++ } ++ ++ return UART_FCR_R_TRIG_11; ++} ++ ++static int do_get_rxtrig(struct tty_port *port) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) ++ return -EINVAL; ++ ++ return fcr_get_rxtrig_bytes(up); ++} ++ ++static int do_serial8250_get_rxtrig(struct tty_port *port) ++{ ++ int rxtrig_bytes; ++ ++ mutex_lock(&port->mutex); ++ rxtrig_bytes = do_get_rxtrig(port); ++ mutex_unlock(&port->mutex); ++ ++ return rxtrig_bytes; ++} ++ ++static ssize_t rx_trig_bytes_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ int rxtrig_bytes; ++ ++ rxtrig_bytes = do_serial8250_get_rxtrig(port); ++ if (rxtrig_bytes < 0) ++ return rxtrig_bytes; ++ ++ return sysfs_emit(buf, "%d\n", rxtrig_bytes); ++} ++ ++static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ struct uart_state *state = container_of(port, struct uart_state, port); ++ struct uart_port *uport = state->uart_port; ++ struct uart_8250_port *up = up_to_u8250p(uport); ++ int rxtrig; ++ ++ if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1 || ++ up->fifo_bug) ++ return -EINVAL; ++ ++ rxtrig = bytes_to_fcr_rxtrig(up, bytes); ++ if (rxtrig < 0) ++ return rxtrig; ++ ++ serial8250_clear_fifos(up); ++ up->fcr &= ~UART_FCR_TRIGGER_MASK; ++ up->fcr |= (unsigned char)rxtrig; ++ serial_out(up, UART_FCR, up->fcr); ++ return 0; ++} ++ ++static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) ++{ ++ int ret; ++ ++ mutex_lock(&port->mutex); ++ ret = do_set_rxtrig(port, bytes); ++ mutex_unlock(&port->mutex); ++ ++ return ret; ++} ++ ++static ssize_t rx_trig_bytes_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ struct tty_port *port = dev_get_drvdata(dev); ++ unsigned char bytes; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ ret = kstrtou8(buf, 10, &bytes); ++ if (ret < 0) ++ return ret; ++ ++ ret = do_serial8250_set_rxtrig(port, bytes); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(rx_trig_bytes); ++ ++static struct attribute *serial8250_dev_attrs[] = { ++ &dev_attr_rx_trig_bytes.attr, ++ NULL ++}; ++ ++static struct attribute_group serial8250_dev_attr_group = { ++ .attrs = serial8250_dev_attrs, ++}; ++ ++static void register_dev_spec_attr_grp(struct uart_8250_port *up) ++{ ++ const struct serial8250_config *conf_type = &uart_config[up->port.type]; ++ ++ if (conf_type->rxtrig_bytes[0]) ++ up->port.attr_group = &serial8250_dev_attr_group; ++} ++ ++static void serial8250_config_port(struct uart_port *port, int flags) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ int ret; ++ ++ /* ++ * Find the region that we can probe for. This in turn ++ * tells us whether we can probe for the type of port. 
++ */ ++ ret = serial8250_request_std_resource(up); ++ if (ret < 0) ++ return; ++ ++ if (port->iotype != up->cur_iotype) ++ set_io_from_upio(port); ++ ++ if (flags & UART_CONFIG_TYPE) ++ autoconfig(up); ++ ++ /* if access method is AU, it is a 16550 with a quirk */ ++ if (port->type == PORT_16550A && port->iotype == UPIO_AU) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ /* HW bugs may trigger IRQ while IIR == NO_INT */ ++ if (port->type == PORT_TEGRA) ++ up->bugs |= UART_BUG_NOMSR; ++ ++ if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) ++ autoconfig_irq(up); ++ ++ if (port->type == PORT_UNKNOWN) ++ serial8250_release_std_resource(up); ++ ++ register_dev_spec_attr_grp(up); ++ up->fcr = uart_config[up->port.type].fcr; ++} ++ ++static int ++serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) ++{ ++ if (ser->irq >= nr_irqs || ser->irq < 0 || ++ ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ++ ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ++ ser->type == PORT_STARTECH) ++ return -EINVAL; ++ return 0; ++} ++ ++static const char *serial8250_type(struct uart_port *port) ++{ ++ int type = port->type; ++ ++ if (type >= ARRAY_SIZE(uart_config)) ++ type = 0; ++ return uart_config[type].name; ++} ++ ++static const struct uart_ops serial8250_pops = { ++ .tx_empty = serial8250_tx_empty, ++ .set_mctrl = serial8250_set_mctrl, ++ .get_mctrl = serial8250_get_mctrl, ++ .stop_tx = serial8250_stop_tx, ++ .start_tx = serial8250_start_tx, ++ .throttle = serial8250_throttle, ++ .unthrottle = serial8250_unthrottle, ++ .stop_rx = serial8250_stop_rx, ++ .enable_ms = serial8250_enable_ms, ++ .break_ctl = serial8250_break_ctl, ++ .startup = serial8250_startup, ++ .shutdown = serial8250_shutdown, ++ .set_termios = serial8250_set_termios, ++ .set_ldisc = serial8250_set_ldisc, ++ .pm = serial8250_pm, ++ .type = serial8250_type, ++ .release_port = serial8250_release_port, ++ .request_port = serial8250_request_port, ++ .config_port = serial8250_config_port, ++ .verify_port = serial8250_verify_port, ++#ifdef CONFIG_CONSOLE_POLL ++ .poll_get_char = serial8250_get_poll_char, ++ .poll_put_char = serial8250_put_poll_char, ++#endif ++}; ++ ++void serial8250_init_port(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ spin_lock_init(&port->lock); ++ port->ops = &serial8250_pops; ++ port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); ++ ++ up->cur_iotype = 0xFF; ++} ++EXPORT_SYMBOL_GPL(serial8250_init_port); ++ ++void serial8250_set_defaults(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ ++ if (up->port.flags & UPF_FIXED_TYPE) { ++ unsigned int type = up->port.type; ++ ++ if (!up->port.fifosize) ++ up->port.fifosize = uart_config[type].fifo_size; ++ if (!up->tx_loadsz) ++ up->tx_loadsz = uart_config[type].tx_loadsz; ++ if (!up->capabilities) ++ up->capabilities = uart_config[type].flags; ++ } ++ ++ set_io_from_upio(port); ++ ++ /* default dma handlers */ ++ if (up->dma) { ++ if (!up->dma->tx_dma) ++ up->dma->tx_dma = serial8250_tx_dma; ++ if (!up->dma->rx_dma) ++ up->dma->rx_dma = serial8250_rx_dma; ++ } ++} ++EXPORT_SYMBOL_GPL(serial8250_set_defaults); ++ ++#ifdef CONFIG_SERIAL_8250_CONSOLE ++ ++static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ serial_port_out(port, UART_TX, ch); ++} ++ ++/* ++ * Restore serial console when h/w power-off detected ++ */ ++static void 
serial8250_console_restore(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ struct ktermios termios; ++ unsigned int baud, quot, frac = 0; ++ ++ termios.c_cflag = port->cons->cflag; ++ termios.c_ispeed = port->cons->ispeed; ++ termios.c_ospeed = port->cons->ospeed; ++ if (port->state->port.tty && termios.c_cflag == 0) { ++ termios.c_cflag = port->state->port.tty->termios.c_cflag; ++ termios.c_ispeed = port->state->port.tty->termios.c_ispeed; ++ termios.c_ospeed = port->state->port.tty->termios.c_ospeed; ++ } ++ ++ baud = serial8250_get_baud_rate(port, &termios, NULL); ++ quot = serial8250_get_divisor(port, baud, &frac); ++ ++ serial8250_set_divisor(port, baud, quot, frac); ++ serial_port_out(port, UART_LCR, up->lcr); ++ serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); ++} ++ ++/* ++ * Print a string to the serial port using the device FIFO ++ * ++ * It sends fifosize bytes and then waits for the fifo ++ * to get empty. ++ */ ++static void serial8250_console_fifo_write(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ int i; ++ const char *end = s + count; ++ unsigned int fifosize = up->tx_loadsz; ++ bool cr_sent = false; ++ ++ while (s != end) { ++ wait_for_lsr(up, UART_LSR_THRE); ++ ++ for (i = 0; i < fifosize && s != end; ++i) { ++ if (*s == '\n' && !cr_sent) { ++ serial_out(up, UART_TX, '\r'); ++ cr_sent = true; ++ } else { ++ serial_out(up, UART_TX, *s++); ++ cr_sent = false; ++ } ++ } ++ } ++} ++ ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * The console_lock must be held when we get here. ++ * ++ * Doing runtime PM is really a bad idea for the kernel console. ++ * Thus, we assume the function is called when device is powered up. ++ */ ++void serial8250_console_write(struct uart_8250_port *up, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_em485 *em485 = up->em485; ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier, use_fifo; ++ int locked = 1; ++ ++ touch_nmi_watchdog(); ++ ++ if (oops_in_progress) ++ locked = spin_trylock_irqsave(&port->lock, flags); ++ else ++ spin_lock_irqsave(&port->lock, flags); ++ ++ /* ++ * First save the IER then disable the interrupts ++ */ ++ ier = serial_port_in(port, UART_IER); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ serial_port_out(port, UART_IER, UART_IER_UUE); ++ else ++ serial_port_out(port, UART_IER, 0); ++ ++ /* check scratch reg to see if port powered off during system sleep */ ++ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { ++ serial8250_console_restore(up); ++ up->canary = 0; ++ } ++ ++ if (em485) { ++ if (em485->tx_stopped) ++ up->rs485_start_tx(up); ++ mdelay(port->rs485.delay_rts_before_send); ++ } ++ ++ use_fifo = (up->capabilities & UART_CAP_FIFO) && ++ /* ++ * BCM283x requires to check the fifo ++ * after each byte. ++ */ ++ !(up->capabilities & UART_CAP_MINI) && ++ /* ++ * tx_loadsz contains the transmit fifo size ++ */ ++ up->tx_loadsz > 1 && ++ (up->fcr & UART_FCR_ENABLE_FIFO) && ++ port->state && ++ test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && ++ /* ++ * After we put a data in the fifo, the controller will send ++ * it regardless of the CTS state. Therefore, only use fifo ++ * if we don't use control flow. 
++ */ ++ !(up->port.flags & UPF_CONS_FLOW); ++ ++ if (likely(use_fifo)) ++ serial8250_console_fifo_write(up, s, count); ++ else ++ uart_console_write(port, s, count, serial8250_console_putchar); ++ ++ /* ++ * Finally, wait for transmitter to become empty ++ * and restore the IER ++ */ ++ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); ++ ++ if (em485) { ++ mdelay(port->rs485.delay_rts_after_send); ++ if (em485->tx_stopped) ++ up->rs485_stop_tx(up); ++ } ++ ++ serial_port_out(port, UART_IER, ier); ++ ++ /* ++ * The receive handling will happen properly because the ++ * receive ready bit will still be set; it is not cleared ++ * on read. However, modem control will not, we must ++ * call it if we have saved something in the saved flags ++ * while processing with interrupts off. ++ */ ++ if (up->msr_saved_flags) ++ serial8250_modem_status(up); ++ ++ if (locked) ++ spin_unlock_irqrestore(&port->lock, flags); ++} ++ ++static unsigned int probe_baud(struct uart_port *port) ++{ ++ unsigned char lcr, dll, dlm; ++ unsigned int quot; ++ ++ lcr = serial_port_in(port, UART_LCR); ++ serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); ++ dll = serial_port_in(port, UART_DLL); ++ dlm = serial_port_in(port, UART_DLM); ++ serial_port_out(port, UART_LCR, lcr); ++ ++ quot = (dlm << 8) | dll; ++ return (port->uartclk / 16) / quot; ++} ++ ++int serial8250_console_setup(struct uart_port *port, char *options, bool probe) ++{ ++ int baud = 9600; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ if (!port->iobase && !port->membase) ++ return -ENODEV; ++ ++ if (options) ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ else if (probe) ++ baud = probe_baud(port); ++ ++ ret = uart_set_options(port, port->cons, baud, parity, bits, flow); ++ if (ret) ++ return ret; ++ ++ if (port->dev) ++ pm_runtime_get_sync(port->dev); ++ ++ return 0; ++} ++ ++int serial8250_console_exit(struct uart_port *port) ++{ ++ if (port->dev) ++ pm_runtime_put_sync(port->dev); ++ ++ return 0; ++} ++ ++#endif /* CONFIG_SERIAL_8250_CONSOLE */ ++ ++MODULE_LICENSE("GPL"); +diff -rupN linux.orig/drivers/tty/serial/8250/Kconfig linux/drivers/tty/serial/8250/Kconfig +--- linux.orig/drivers/tty/serial/8250/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/8250/Kconfig 2022-12-04 10:40:26.708034065 -0500 @@ -9,6 +9,7 @@ config SERIAL_8250 depends on !S390 select SERIAL_CORE @@ -4064,11 +26933,10 @@ index d0b49e15fbf5e..02c308467339c 100644 help This selects whether you want to include the driver for the standard serial ports. The standard answer is Y. 
People who might say N -diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c -index 15f0e4d88c5a0..ffdb001e3d109 100644 ---- a/drivers/tty/serial/amba-pl011.c -+++ b/drivers/tty/serial/amba-pl011.c -@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +diff -rupN linux.orig/drivers/tty/serial/amba-pl011.c linux/drivers/tty/serial/amba-pl011.c +--- linux.orig/drivers/tty/serial/amba-pl011.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/amba-pl011.c 2022-12-04 10:40:26.708034065 -0500 +@@ -2308,18 +2308,24 @@ pl011_console_write(struct console *co, { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; @@ -4097,7 +26965,7 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 /* * First save the CR then disable the interrupts -@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) +@@ -2345,8 +2351,7 @@ pl011_console_write(struct console *co, pl011_write(old_cr, uap, REG_CR); if (locked) @@ -4107,11 +26975,10 @@ index 15f0e4d88c5a0..ffdb001e3d109 100644 clk_disable(uap->clk); } -diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c -index 0aa666e247d57..d7130d1ae64c0 100644 ---- a/drivers/tty/serial/omap-serial.c -+++ b/drivers/tty/serial/omap-serial.c -@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console *co, const char *s, +diff -rupN linux.orig/drivers/tty/serial/omap-serial.c linux/drivers/tty/serial/omap-serial.c +--- linux.orig/drivers/tty/serial/omap-serial.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/serial/omap-serial.c 2022-12-04 10:40:26.708034065 -0500 +@@ -1240,13 +1240,10 @@ serial_omap_console_write(struct console unsigned int ier; int locked = 1; @@ -4128,7 +26995,7 @@ index 0aa666e247d57..d7130d1ae64c0 100644 /* * First save the IER then disable the interrupts -@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console *co, const char *s, +@@ -1273,8 +1270,7 @@ serial_omap_console_write(struct console check_modem_status(up); if (locked) @@ -4138,11 +27005,10 @@ index 0aa666e247d57..d7130d1ae64c0 100644 } static int __init -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index d2b2720db6ca7..18e623325887f 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_mask) +diff -rupN linux.orig/drivers/tty/sysrq.c linux/drivers/tty/sysrq.c +--- linux.orig/drivers/tty/sysrq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/tty/sysrq.c 2022-12-04 10:40:26.708034065 -0500 +@@ -581,6 +581,7 @@ void __handle_sysrq(int key, bool check_ rcu_sysrq_start(); rcu_read_lock(); @@ -4150,7 +27016,7 @@ index d2b2720db6ca7..18e623325887f 100644 /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. 
We do not -@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_mask) +@@ -622,6 +623,7 @@ void __handle_sysrq(int key, bool check_ pr_cont("\n"); console_loglevel = orig_log_level; } @@ -4158,10 +27024,9 @@ index d2b2720db6ca7..18e623325887f 100644 rcu_read_unlock(); rcu_sysrq_end(); -diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h -index 4e0e50e7ac153..173e979b84a93 100644 ---- a/drivers/vdpa/vdpa_user/iova_domain.h -+++ b/drivers/vdpa/vdpa_user/iova_domain.h +diff -rupN linux.orig/drivers/vdpa/vdpa_user/iova_domain.h linux/drivers/vdpa/vdpa_user/iova_domain.h +--- linux.orig/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/drivers/vdpa/vdpa_user/iova_domain.h 2022-12-04 10:40:26.708034065 -0500 @@ -14,7 +14,6 @@ #include #include @@ -4170,10 +27035,9 @@ index 4e0e50e7ac153..173e979b84a93 100644 #define IOVA_START_PFN 1 -diff --git a/fs/dcache.c b/fs/dcache.c -index bb0c4d0038dbd..2ee8636016ee9 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c +diff -rupN linux.orig/fs/dcache.c linux/fs/dcache.c +--- linux.orig/fs/dcache.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/fs/dcache.c 2022-12-04 10:40:26.708034065 -0500 @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) @@ -4191,7 +27055,7 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) -@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, +@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct in wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); @@ -4201,10 +27065,9 @@ index bb0c4d0038dbd..2ee8636016ee9 100644 wake_up_all(d_wait); } -diff --git a/include/linux/console.h b/include/linux/console.h -index 8c1686e2c2337..8a813cbaf9285 100644 ---- a/include/linux/console.h -+++ b/include/linux/console.h +diff -rupN linux.orig/include/linux/console.h linux/include/linux/console.h +--- linux.orig/include/linux/console.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/console.h 2022-12-04 10:40:26.712034055 -0500 @@ -16,6 +16,7 @@ #include @@ -4269,10 +27132,9 @@ index 8c1686e2c2337..8a813cbaf9285 100644 CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; -diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h -index 84a466b176cf4..df6d17bc30aa3 100644 ---- a/include/linux/entry-common.h -+++ b/include/linux/entry-common.h +diff -rupN linux.orig/include/linux/entry-common.h linux/include/linux/entry-common.h +--- linux.orig/include/linux/entry-common.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/entry-common.h 2022-12-04 10:40:26.712034055 -0500 @@ -57,9 +57,15 @@ # define ARCH_EXIT_TO_USER_MODE_WORK (0) #endif @@ -4290,11 +27152,10 @@ index 84a466b176cf4..df6d17bc30aa3 100644 ARCH_EXIT_TO_USER_MODE_WORK) /** -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index a92bce40b04b3..bf82980f569df 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); +diff -rupN linux.orig/include/linux/interrupt.h linux/include/linux/interrupt.h +--- linux.orig/include/linux/interrupt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/interrupt.h 2022-12-04 10:40:26.712034055 -0500 +@@ -605,6 +605,35 @@ extern void __raise_softirq_irqoff(unsig extern void raise_softirq_irqoff(unsigned int nr); extern void 
raise_softirq(unsigned int nr); @@ -4330,11 +27191,10 @@ index a92bce40b04b3..bf82980f569df 100644 DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) -diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h -index 1cd4e36890fbf..844a8e30e6de5 100644 ---- a/include/linux/irqdesc.h -+++ b/include/linux/irqdesc.h -@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int irq); +diff -rupN linux.orig/include/linux/irqdesc.h linux/include/linux/irqdesc.h +--- linux.orig/include/linux/irqdesc.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/irqdesc.h 2022-12-04 10:40:26.712034055 -0500 +@@ -169,6 +169,7 @@ int generic_handle_irq_safe(unsigned int * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); @@ -4342,10 +27202,9 @@ index 1cd4e36890fbf..844a8e30e6de5 100644 int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif -diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 1f1099dac3f05..1023f349af716 100644 ---- a/include/linux/lockdep.h -+++ b/include/linux/lockdep.h +diff -rupN linux.orig/include/linux/lockdep.h linux/include/linux/lockdep.h +--- linux.orig/include/linux/lockdep.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/lockdep.h 2022-12-04 10:40:26.712034055 -0500 @@ -435,7 +435,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; @@ -4354,11 +27213,10 @@ index 1f1099dac3f05..1023f349af716 100644 /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. -diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h -index 15ae78cd28536..b8728d11c9490 100644 ---- a/include/linux/mmdebug.h -+++ b/include/linux/mmdebug.h -@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); +diff -rupN linux.orig/include/linux/mmdebug.h linux/include/linux/mmdebug.h +--- linux.orig/include/linux/mmdebug.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/mmdebug.h 2022-12-04 10:40:26.712034055 -0500 +@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif @@ -4371,10 +27229,9 @@ index 15ae78cd28536..b8728d11c9490 100644 #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else -diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index 05d6f3facd5a5..5e6b840f5a9ac 100644 ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h +diff -rupN linux.orig/include/linux/netdevice.h linux/include/linux/netdevice.h +--- linux.orig/include/linux/netdevice.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/netdevice.h 2022-12-04 10:40:26.712034055 -0500 @@ -3156,7 +3156,11 @@ struct softnet_data { int defer_count; int defer_ipi_scheduled; @@ -4387,10 +27244,9 @@ index 05d6f3facd5a5..5e6b840f5a9ac 100644 }; static inline void input_queue_head_incr(struct softnet_data *sd) -diff --git a/include/linux/preempt.h b/include/linux/preempt.h -index b4381f255a5ca..12f59cdaaedda 100644 ---- a/include/linux/preempt.h -+++ b/include/linux/preempt.h +diff -rupN linux.orig/include/linux/preempt.h linux/include/linux/preempt.h +--- linux.orig/include/linux/preempt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/preempt.h 2022-12-04 10:40:26.712034055 -0500 @@ -196,6 +196,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) @@ -4537,10 +27393,9 @@ index b4381f255a5ca..12f59cdaaedda 100644 +} + #endif /* __LINUX_PREEMPT_H */ -diff --git a/include/linux/printk.h b/include/linux/printk.h -index cf7d666ab1f8e..f88ec15f83dcc 100644 ---- a/include/linux/printk.h -+++ b/include/linux/printk.h +diff -rupN linux.orig/include/linux/printk.h linux/include/linux/printk.h +--- linux.orig/include/linux/printk.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/printk.h 2022-12-04 10:40:26.712034055 -0500 @@ -169,7 +169,11 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit @@ -4553,7 +27408,7 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -@@ -221,11 +225,23 @@ static inline void printk_deferred_exit(void) +@@ -221,11 +225,23 @@ static inline void printk_deferred_exit( { } @@ -4577,10 +27432,9 @@ index cf7d666ab1f8e..f88ec15f83dcc 100644 static inline int printk_ratelimit(void) { return 0; -diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h -index 8f416c5e929ea..c0ef596f340b5 100644 ---- a/include/linux/rwlock.h -+++ b/include/linux/rwlock.h +diff -rupN linux.orig/include/linux/rwlock.h linux/include/linux/rwlock.h +--- linux.orig/include/linux/rwlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/rwlock.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H @@ -4590,11 +27444,10 @@ index 8f416c5e929ea..c0ef596f340b5 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 8d82d6d326701..e1623b3001c5b 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) +diff -rupN linux.orig/include/linux/sched.h linux/include/linux/sched.h +--- linux.orig/include/linux/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/sched.h 2022-12-04 10:40:26.712034055 -0500 +@@ -2038,6 +2038,43 @@ static inline int test_tsk_need_resched( return 
unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -4638,10 +27491,9 @@ index 8d82d6d326701..e1623b3001c5b 100644 /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h -index 16e3d75a324c7..ee1f719a21678 100644 ---- a/include/linux/serial_8250.h -+++ b/include/linux/serial_8250.h +diff -rupN linux.orig/include/linux/serial_8250.h linux/include/linux/serial_8250.h +--- linux.orig/include/linux/serial_8250.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/serial_8250.h 2022-12-04 10:40:26.712034055 -0500 @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H @@ -4659,7 +27511,7 @@ index 16e3d75a324c7..ee1f719a21678 100644 struct uart_8250_dma *dma; const struct uart_8250_ops *ops; -@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_82 void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); @@ -4668,28 +27520,9 @@ index 16e3d75a324c7..ee1f719a21678 100644 int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h -index 5c0c5174155d0..1341f7d62da44 100644 ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -1,6 +1,7 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - #ifndef __LINUX_SPINLOCK_H - #define __LINUX_SPINLOCK_H -+#define __LINUX_INSIDE_SPINLOCK_H - - /* - * include/linux/spinlock.h - generic spinlock/rwlock declarations -@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, - - void free_bucket_spinlocks(spinlock_t *locks); - -+#undef __LINUX_INSIDE_SPINLOCK_H - #endif /* __LINUX_SPINLOCK_H */ -diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h -index 51fa0dab68c4d..89eb6f4c659c7 100644 ---- a/include/linux/spinlock_api_smp.h -+++ b/include/linux/spinlock_api_smp.h +diff -rupN linux.orig/include/linux/spinlock_api_smp.h linux/include/linux/spinlock_api_smp.h +--- linux.orig/include/linux/spinlock_api_smp.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_smp.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H @@ -4699,10 +27532,9 @@ index 51fa0dab68c4d..89eb6f4c659c7 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h -index b8ba00ccccdeb..819aeba1c87e6 100644 ---- a/include/linux/spinlock_api_up.h -+++ b/include/linux/spinlock_api_up.h +diff -rupN linux.orig/include/linux/spinlock_api_up.h linux/include/linux/spinlock_api_up.h +--- linux.orig/include/linux/spinlock_api_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_api_up.h 2022-12-04 10:40:26.712034055 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H @@ -4712,10 +27544,26 @@ index b8ba00ccccdeb..819aeba1c87e6 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h -index 835aedaf68acd..61c49b16f69ab 100644 ---- a/include/linux/spinlock_rt.h -+++ b/include/linux/spinlock_rt.h +diff -rupN linux.orig/include/linux/spinlock.h 
linux/include/linux/spinlock.h +--- linux.orig/include/linux/spinlock.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock.h 2022-12-04 10:40:26.712034055 -0500 +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef __LINUX_SPINLOCK_H + #define __LINUX_SPINLOCK_H ++#define __LINUX_INSIDE_SPINLOCK_H + + /* + * include/linux/spinlock.h - generic spinlock/rwlock declarations +@@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t + + void free_bucket_spinlocks(spinlock_t *locks); + ++#undef __LINUX_INSIDE_SPINLOCK_H + #endif /* __LINUX_SPINLOCK_H */ +diff -rupN linux.orig/include/linux/spinlock_rt.h linux/include/linux/spinlock_rt.h +--- linux.orig/include/linux/spinlock_rt.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_rt.h 2022-12-04 10:40:26.712034055 -0500 @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H @@ -4725,10 +27573,9 @@ index 835aedaf68acd..61c49b16f69ab 100644 #error Do not include directly. Use spinlock.h #endif -diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h -index 16521074b6f7c..c87204247592f 100644 ---- a/include/linux/spinlock_up.h -+++ b/include/linux/spinlock_up.h +diff -rupN linux.orig/include/linux/spinlock_up.h linux/include/linux/spinlock_up.h +--- linux.orig/include/linux/spinlock_up.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/spinlock_up.h 2022-12-04 10:40:26.716034044 -0500 @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H @@ -4738,11 +27585,10 @@ index 16521074b6f7c..c87204247592f 100644 # error "please don't include this file directly" #endif -diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h -index 9f392ec76f2bb..779e0e96b9cb0 100644 ---- a/include/linux/thread_info.h -+++ b/include/linux/thread_info.h -@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti +diff -rupN linux.orig/include/linux/thread_info.h linux/include/linux/thread_info.h +--- linux.orig/include/linux/thread_info.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/thread_info.h 2022-12-04 10:40:26.716034044 -0500 +@@ -177,7 +177,17 @@ static __always_inline unsigned long rea clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ @@ -4761,10 +27607,9 @@ index 9f392ec76f2bb..779e0e96b9cb0 100644 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, -diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 20749bd9db718..224bf60d6563c 100644 ---- a/include/linux/trace_events.h -+++ b/include/linux/trace_events.h +diff -rupN linux.orig/include/linux/trace_events.h linux/include/linux/trace_events.h +--- linux.orig/include/linux/trace_events.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/trace_events.h 2022-12-04 10:40:26.716034044 -0500 @@ -70,6 +70,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; @@ -4773,7 +27618,7 @@ index 20749bd9db718..224bf60d6563c 100644 }; #define TRACE_EVENT_TYPE_MAX \ -@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, +@@ -159,9 +160,10 @@ static inline void tracing_generic_entry unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; @@ -4799,10 +27644,9 @@ index 20749bd9db718..224bf60d6563c 100644 TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; -diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h 
-index 6ad4e9032d538..ffe48e69b3f3a 100644 ---- a/include/linux/u64_stats_sync.h -+++ b/include/linux/u64_stats_sync.h +diff -rupN linux.orig/include/linux/u64_stats_sync.h linux/include/linux/u64_stats_sync.h +--- linux.orig/include/linux/u64_stats_sync.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/include/linux/u64_stats_sync.h 2022-12-04 10:40:26.716034044 -0500 @@ -8,7 +8,7 @@ * * Key points : @@ -4843,7 +27687,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 seqcount_t seq; #endif }; -@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_sta local64_inc(&p->v); } @@ -4867,7 +27711,7 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 typedef struct { u64 v; -@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_stats_t *p) +@@ -123,122 +134,82 @@ static inline void u64_stats_inc(u64_sta { p->v++; } @@ -4944,25 +27788,50 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 -#else - return 0; -#endif -+} -+ + } + +-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_disable(); +-#endif +- return __u64_stats_fetch_begin(syncp); + return read_seqcount_retry(&syncp->seq, start); -+} + } +#endif /* !64 bit */ -+ + +-static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +- return read_seqcount_retry(&syncp->seq, start); +-#else +- return false; +-#endif + __u64_stats_update_begin(syncp); -+} -+ + } + +-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +- unsigned int start) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) -+{ + { +-#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) +- preempt_enable(); +-#endif +- return __u64_stats_fetch_retry(syncp, start); + __u64_stats_update_end(syncp); -+} -+ + } + +-/* +- * In case irq handlers can update u64 counters, readers can use following helpers +- * - SMP 32bit arches use seqcount protection, irq safe. +- * - UP 32bit must disable irqs. +- * - 64bit have no problem atomically reading u64 values, irq safe. 
+- */ +-static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); @@ -4976,54 +27845,23 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); - } - - static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) ++} ++ ++static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_disable(); --#endif - return __u64_stats_fetch_begin(syncp); - } - --static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, -- unsigned int start) --{ --#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -- return read_seqcount_retry(&syncp->seq, start); --#else -- return false; --#endif --} -- - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) - { --#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) -- preempt_enable(); --#endif -- return __u64_stats_fetch_retry(syncp, start); --} -- --/* -- * In case irq handlers can update u64 counters, readers can use following helpers -- * - SMP 32bit arches use seqcount protection, irq safe. -- * - UP 32bit must disable irqs. -- * - 64bit have no problem atomically reading u64 values, irq safe. -- */ --static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) --{ -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif -- return __u64_stats_fetch_begin(syncp); --} -- + return __u64_stats_fetch_begin(syncp); + } + -static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) --{ ++static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, ++ unsigned int start) + { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) @@ -5032,10 +27870,9 @@ index 6ad4e9032d538..ffe48e69b3f3a 100644 return __u64_stats_fetch_retry(syncp, start); } -diff --git a/init/Kconfig b/init/Kconfig -index 532362fcfe31f..08ec5f25e6642 100644 ---- a/init/Kconfig -+++ b/init/Kconfig +diff -rupN linux.orig/init/Kconfig linux/init/Kconfig +--- linux.orig/init/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/init/Kconfig 2022-12-04 10:40:26.716034044 -0500 @@ -1574,6 +1574,10 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. 
@@ -5047,27 +27884,10 @@ index 532362fcfe31f..08ec5f25e6642 100644 config BUG bool "BUG() support" if EXPERT default y -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a8214..260c08efeb486 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -1,5 +1,11 @@ - # SPDX-License-Identifier: GPL-2.0-only - -+config HAVE_PREEMPT_LAZY -+ bool -+ -+config PREEMPT_LAZY -+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT -+ - config PREEMPT_NONE_BUILD - bool - -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 22e7a805c6723..b492e482b63a9 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -2107,11 +2107,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, +diff -rupN linux.orig/kernel/bpf/syscall.c linux/kernel/bpf/syscall.c +--- linux.orig/kernel/bpf/syscall.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/bpf/syscall.c 2022-12-04 10:40:26.716034044 -0500 +@@ -2118,11 +2118,11 @@ static void bpf_prog_get_stats(const str st = per_cpu_ptr(prog->stats, cpu); do { @@ -5081,11 +27901,5333 @@ index 22e7a805c6723..b492e482b63a9 100644 nsecs += tnsecs; cnt += tcnt; misses += tmisses; -diff --git a/kernel/entry/common.c b/kernel/entry/common.c -index 063068a9ea9b3..26b772720b227 100644 ---- a/kernel/entry/common.c -+++ b/kernel/entry/common.c -@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, +diff -rupN linux.orig/kernel/bpf/syscall.c.orig linux/kernel/bpf/syscall.c.orig +--- linux.orig/kernel/bpf/syscall.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/kernel/bpf/syscall.c.orig 2022-12-04 10:40:18.684054629 -0500 +@@ -0,0 +1,5319 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ ++ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) ++#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) ++#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ ++ IS_FD_HASH(map)) ++ ++#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) ++ ++DEFINE_PER_CPU(int, bpf_prog_active); ++static DEFINE_IDR(prog_idr); ++static DEFINE_SPINLOCK(prog_idr_lock); ++static DEFINE_IDR(map_idr); ++static DEFINE_SPINLOCK(map_idr_lock); ++static DEFINE_IDR(link_idr); ++static DEFINE_SPINLOCK(link_idr_lock); ++ ++int sysctl_unprivileged_bpf_disabled __read_mostly = ++ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; ++ ++static const struct bpf_map_ops * const bpf_map_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) \ ++ [_id] = &_ops, ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++/* ++ * If we're handed a bigger struct than we know of, ensure all the unknown bits ++ * are 0 - i.e. new user-space does not rely on any kernel feature extensions ++ * we don't know about yet. 
++ * ++ * There is a ToCToU between this function call and the following ++ * copy_from_user() call. However, this is not a concern since this function is ++ * meant to be a future-proofing of bits. ++ */ ++int bpf_check_uarg_tail_zero(bpfptr_t uaddr, ++ size_t expected_size, ++ size_t actual_size) ++{ ++ int res; ++ ++ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ ++ return -E2BIG; ++ ++ if (actual_size <= expected_size) ++ return 0; ++ ++ if (uaddr.is_kernel) ++ res = memchr_inv(uaddr.kernel + expected_size, 0, ++ actual_size - expected_size) == NULL; ++ else ++ res = check_zeroed_user(uaddr.user + expected_size, ++ actual_size - expected_size); ++ if (res < 0) ++ return res; ++ return res ? 0 : -E2BIG; ++} ++ ++const struct bpf_map_ops bpf_map_offload_ops = { ++ .map_meta_equal = bpf_map_meta_equal, ++ .map_alloc = bpf_map_offload_map_alloc, ++ .map_free = bpf_map_offload_map_free, ++ .map_check_btf = map_check_no_btf, ++}; ++ ++static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) ++{ ++ const struct bpf_map_ops *ops; ++ u32 type = attr->map_type; ++ struct bpf_map *map; ++ int err; ++ ++ if (type >= ARRAY_SIZE(bpf_map_types)) ++ return ERR_PTR(-EINVAL); ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); ++ ops = bpf_map_types[type]; ++ if (!ops) ++ return ERR_PTR(-EINVAL); ++ ++ if (ops->map_alloc_check) { ++ err = ops->map_alloc_check(attr); ++ if (err) ++ return ERR_PTR(err); ++ } ++ if (attr->map_ifindex) ++ ops = &bpf_map_offload_ops; ++ map = ops->map_alloc(attr); ++ if (IS_ERR(map)) ++ return map; ++ map->ops = ops; ++ map->map_type = type; ++ return map; ++} ++ ++static void bpf_map_write_active_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->writecnt); ++} ++ ++static void bpf_map_write_active_dec(struct bpf_map *map) ++{ ++ atomic64_dec(&map->writecnt); ++} ++ ++bool bpf_map_write_active(const struct bpf_map *map) ++{ ++ return atomic64_read(&map->writecnt) != 0; ++} ++ ++static u32 bpf_map_value_size(const struct bpf_map *map) ++{ ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || ++ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) ++ return round_up(map->value_size, 8) * num_possible_cpus(); ++ else if (IS_FD_MAP(map)) ++ return sizeof(u32); ++ else ++ return map->value_size; ++} ++ ++static void maybe_wait_bpf_programs(struct bpf_map *map) ++{ ++ /* Wait for any running BPF programs to complete so that ++ * userspace, when we return to it, knows that all programs ++ * that could be running use the new map value. 
++ */ ++ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || ++ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++ synchronize_rcu(); ++} ++ ++static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, ++ void *value, __u64 flags) ++{ ++ int err; ++ ++ /* Need to create a kthread, thus must support schedule */ ++ if (bpf_map_is_dev_bound(map)) { ++ return bpf_map_offload_update_elem(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ return map->ops->map_update_elem(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || ++ map->map_type == BPF_MAP_TYPE_SOCKMAP) { ++ return sock_map_update_elem_sys(map, key, value, flags); ++ } else if (IS_FD_PROG_ARRAY(map)) { ++ return bpf_fd_array_map_update_elem(map, f.file, key, value, ++ flags); ++ } ++ ++ bpf_disable_instrumentation(); ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ err = bpf_percpu_hash_update(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { ++ err = bpf_percpu_array_update(map, key, value, flags); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { ++ err = bpf_percpu_cgroup_storage_update(map, key, value, ++ flags); ++ } else if (IS_FD_ARRAY(map)) { ++ rcu_read_lock(); ++ err = bpf_fd_array_map_update_elem(map, f.file, key, value, ++ flags); ++ rcu_read_unlock(); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { ++ rcu_read_lock(); ++ err = bpf_fd_htab_map_update_elem(map, f.file, key, value, ++ flags); ++ rcu_read_unlock(); ++ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { ++ /* rcu_read_lock() is not needed */ ++ err = bpf_fd_reuseport_array_update_elem(map, key, value, ++ flags); ++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK || ++ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ err = map->ops->map_push_elem(map, value, flags); ++ } else { ++ rcu_read_lock(); ++ err = map->ops->map_update_elem(map, key, value, flags); ++ rcu_read_unlock(); ++ } ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++ ++ return err; ++} ++ ++static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, ++ __u64 flags) ++{ ++ void *ptr; ++ int err; ++ ++ if (bpf_map_is_dev_bound(map)) ++ return bpf_map_offload_lookup_elem(map, key, value); ++ ++ bpf_disable_instrumentation(); ++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ err = bpf_percpu_hash_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { ++ err = bpf_percpu_array_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { ++ err = bpf_percpu_cgroup_storage_copy(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { ++ err = bpf_stackmap_copy(map, key, value); ++ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { ++ err = bpf_fd_array_map_lookup_elem(map, key, value); ++ } else if (IS_FD_HASH(map)) { ++ err = bpf_fd_htab_map_lookup_elem(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { ++ err = bpf_fd_reuseport_array_lookup_elem(map, key, value); ++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK || ++ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ err = map->ops->map_peek_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { 
++ /* struct_ops map requires directly updating "value" */ ++ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); ++ } else { ++ rcu_read_lock(); ++ if (map->ops->map_lookup_elem_sys_only) ++ ptr = map->ops->map_lookup_elem_sys_only(map, key); ++ else ++ ptr = map->ops->map_lookup_elem(map, key); ++ if (IS_ERR(ptr)) { ++ err = PTR_ERR(ptr); ++ } else if (!ptr) { ++ err = -ENOENT; ++ } else { ++ err = 0; ++ if (flags & BPF_F_LOCK) ++ /* lock 'ptr' and copy everything but lock */ ++ copy_map_value_locked(map, value, ptr, true); ++ else ++ copy_map_value(map, value, ptr); ++ /* mask lock and timer, since value wasn't zero inited */ ++ check_and_init_map_value(map, value); ++ } ++ rcu_read_unlock(); ++ } ++ ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++ ++ return err; ++} ++ ++/* Please, do not use this function outside from the map creation path ++ * (e.g. in map update path) without taking care of setting the active ++ * memory cgroup (see at bpf_map_kmalloc_node() for example). ++ */ ++static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) ++{ ++ /* We really just want to fail instead of triggering OOM killer ++ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, ++ * which is used for lower order allocation requests. ++ * ++ * It has been observed that higher order allocation requests done by ++ * vmalloc with __GFP_NORETRY being set might fail due to not trying ++ * to reclaim memory from the page cache, thus we set ++ * __GFP_RETRY_MAYFAIL to avoid such situations. ++ */ ++ ++ const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; ++ unsigned int flags = 0; ++ unsigned long align = 1; ++ void *area; ++ ++ if (size >= SIZE_MAX) ++ return NULL; ++ ++ /* kmalloc()'ed memory can't be mmap()'ed */ ++ if (mmapable) { ++ BUG_ON(!PAGE_ALIGNED(size)); ++ align = SHMLBA; ++ flags = VM_USERMAP; ++ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { ++ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, ++ numa_node); ++ if (area != NULL) ++ return area; ++ } ++ ++ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, ++ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, ++ flags, numa_node, __builtin_return_address(0)); ++} ++ ++void *bpf_map_area_alloc(u64 size, int numa_node) ++{ ++ return __bpf_map_area_alloc(size, numa_node, false); ++} ++ ++void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) ++{ ++ return __bpf_map_area_alloc(size, numa_node, true); ++} ++ ++void bpf_map_area_free(void *area) ++{ ++ kvfree(area); ++} ++ ++static u32 bpf_map_flags_retain_permanent(u32 flags) ++{ ++ /* Some map creation flags are not tied to the map object but ++ * rather to the map fd instead, so they have no meaning upon ++ * map object inspection since multiple file descriptors with ++ * different (access) properties can exist here. Thus, given ++ * this has zero meaning for the map itself, lets clear these ++ * from here. 
++ */ ++ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); ++} ++ ++void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) ++{ ++ map->map_type = attr->map_type; ++ map->key_size = attr->key_size; ++ map->value_size = attr->value_size; ++ map->max_entries = attr->max_entries; ++ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); ++ map->numa_node = bpf_map_attr_numa_node(attr); ++ map->map_extra = attr->map_extra; ++} ++ ++static int bpf_map_alloc_id(struct bpf_map *map) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&map_idr_lock); ++ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ map->id = id; ++ spin_unlock_bh(&map_idr_lock); ++ idr_preload_end(); ++ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ? 0 : id; ++} ++ ++void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* Offloaded maps are removed from the IDR store when their device ++ * disappears - even if someone holds an fd to them they are unusable, ++ * the memory is gone, all ops will fail; they are simply waiting for ++ * refcnt to drop to be freed. ++ */ ++ if (!map->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&map_idr_lock, flags); ++ else ++ __acquire(&map_idr_lock); ++ ++ idr_remove(&map_idr, map->id); ++ map->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&map_idr_lock, flags); ++ else ++ __release(&map_idr_lock); ++} ++ ++#ifdef CONFIG_MEMCG_KMEM ++static void bpf_map_save_memcg(struct bpf_map *map) ++{ ++ /* Currently if a map is created by a process belonging to the root ++ * memory cgroup, get_obj_cgroup_from_current() will return NULL. ++ * So we have to check map->objcg for being NULL each time it's ++ * being used. ++ */ ++ map->objcg = get_obj_cgroup_from_current(); ++} ++ ++static void bpf_map_release_memcg(struct bpf_map *map) ++{ ++ if (map->objcg) ++ obj_cgroup_put(map->objcg); ++} ++ ++static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) ++{ ++ if (map->objcg) ++ return get_mem_cgroup_from_objcg(map->objcg); ++ ++ return root_mem_cgroup; ++} ++ ++void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, ++ int node) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = kzalloc(size, flags | __GFP_ACCOUNT); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, ++ size_t align, gfp_t flags) ++{ ++ struct mem_cgroup *memcg, *old_memcg; ++ void __percpu *ptr; ++ ++ memcg = bpf_map_get_memcg(map); ++ old_memcg = set_active_memcg(memcg); ++ ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); ++ set_active_memcg(old_memcg); ++ mem_cgroup_put(memcg); ++ ++ return ptr; ++} ++ ++#else ++static void bpf_map_save_memcg(struct bpf_map *map) ++{ ++} ++ ++static void bpf_map_release_memcg(struct bpf_map *map) ++{ ++} ++#endif ++ ++static int bpf_map_kptr_off_cmp(const void *a, const void *b) ++{ ++ const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; ++ ++ if 
(off_desc1->offset < off_desc2->offset) ++ return -1; ++ else if (off_desc1->offset > off_desc2->offset) ++ return 1; ++ return 0; ++} ++ ++struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) ++{ ++ /* Since members are iterated in btf_find_field in increasing order, ++ * offsets appended to kptr_off_tab are in increasing order, so we can ++ * do bsearch to find exact match. ++ */ ++ struct bpf_map_value_off *tab; ++ ++ if (!map_value_has_kptrs(map)) ++ return NULL; ++ tab = map->kptr_off_tab; ++ return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); ++} ++ ++void bpf_map_free_kptr_off_tab(struct bpf_map *map) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab; ++ int i; ++ ++ if (!map_value_has_kptrs(map)) ++ return; ++ for (i = 0; i < tab->nr_off; i++) { ++ if (tab->off[i].kptr.module) ++ module_put(tab->off[i].kptr.module); ++ btf_put(tab->off[i].kptr.btf); ++ } ++ kfree(tab); ++ map->kptr_off_tab = NULL; ++} ++ ++struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; ++ int size, i; ++ ++ if (!map_value_has_kptrs(map)) ++ return ERR_PTR(-ENOENT); ++ size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); ++ new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); ++ if (!new_tab) ++ return ERR_PTR(-ENOMEM); ++ /* Do a deep copy of the kptr_off_tab */ ++ for (i = 0; i < tab->nr_off; i++) { ++ btf_get(tab->off[i].kptr.btf); ++ if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { ++ while (i--) { ++ if (tab->off[i].kptr.module) ++ module_put(tab->off[i].kptr.module); ++ btf_put(tab->off[i].kptr.btf); ++ } ++ kfree(new_tab); ++ return ERR_PTR(-ENXIO); ++ } ++ } ++ return new_tab; ++} ++ ++bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) ++{ ++ struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; ++ bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); ++ int size; ++ ++ if (!a_has_kptr && !b_has_kptr) ++ return true; ++ if (a_has_kptr != b_has_kptr) ++ return false; ++ if (tab_a->nr_off != tab_b->nr_off) ++ return false; ++ size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); ++ return !memcmp(tab_a, tab_b, size); ++} ++ ++/* Caller must ensure map_value_has_kptrs is true. Note that this function can ++ * be called on a map value while the map_value is visible to BPF programs, as ++ * it ensures the correct synchronization, and we already enforce the same using ++ * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. 
++ */ ++void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) ++{ ++ struct bpf_map_value_off *tab = map->kptr_off_tab; ++ unsigned long *btf_id_ptr; ++ int i; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ struct bpf_map_value_off_desc *off_desc = &tab->off[i]; ++ unsigned long old_ptr; ++ ++ btf_id_ptr = map_value + off_desc->offset; ++ if (off_desc->type == BPF_KPTR_UNREF) { ++ u64 *p = (u64 *)btf_id_ptr; ++ ++ WRITE_ONCE(*p, 0); ++ continue; ++ } ++ old_ptr = xchg(btf_id_ptr, 0); ++ off_desc->kptr.dtor((void *)old_ptr); ++ } ++} ++ ++/* called from workqueue */ ++static void bpf_map_free_deferred(struct work_struct *work) ++{ ++ struct bpf_map *map = container_of(work, struct bpf_map, work); ++ ++ security_bpf_map_free(map); ++ kfree(map->off_arr); ++ bpf_map_release_memcg(map); ++ /* implementation dependent freeing, map_free callback also does ++ * bpf_map_free_kptr_off_tab, if needed. ++ */ ++ map->ops->map_free(map); ++} ++ ++static void bpf_map_put_uref(struct bpf_map *map) ++{ ++ if (atomic64_dec_and_test(&map->usercnt)) { ++ if (map->ops->map_release_uref) ++ map->ops->map_release_uref(map); ++ } ++} ++ ++/* decrement map refcnt and schedule it for freeing via workqueue ++ * (unrelying map implementation ops->map_free() might sleep) ++ */ ++static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) ++{ ++ if (atomic64_dec_and_test(&map->refcnt)) { ++ /* bpf_map_free_id() must be called first */ ++ bpf_map_free_id(map, do_idr_lock); ++ btf_put(map->btf); ++ INIT_WORK(&map->work, bpf_map_free_deferred); ++ schedule_work(&map->work); ++ } ++} ++ ++void bpf_map_put(struct bpf_map *map) ++{ ++ __bpf_map_put(map, true); ++} ++EXPORT_SYMBOL_GPL(bpf_map_put); ++ ++void bpf_map_put_with_uref(struct bpf_map *map) ++{ ++ bpf_map_put_uref(map); ++ bpf_map_put(map); ++} ++ ++static int bpf_map_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_release) ++ map->ops->map_release(map, filp); ++ ++ bpf_map_put_with_uref(map); ++ return 0; ++} ++ ++static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) ++{ ++ fmode_t mode = f.file->f_mode; ++ ++ /* Our file permissions may have been overridden by global ++ * map permissions facing syscall side. ++ */ ++ if (READ_ONCE(map->frozen)) ++ mode &= ~FMODE_CAN_WRITE; ++ return mode; ++} ++ ++#ifdef CONFIG_PROC_FS ++/* Provides an approximation of the map's memory footprint. ++ * Used only to provide a backward compatibility and display ++ * a reasonable "memlock" info. 
++ */ ++static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) ++{ ++ unsigned long size; ++ ++ size = round_up(map->key_size + bpf_map_value_size(map), 8); ++ ++ return round_up(map->max_entries * size, PAGE_SIZE); ++} ++ ++static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ struct bpf_map *map = filp->private_data; ++ u32 type = 0, jited = 0; ++ ++ if (map_type_contains_progs(map)) { ++ spin_lock(&map->owner.lock); ++ type = map->owner.type; ++ jited = map->owner.jited; ++ spin_unlock(&map->owner.lock); ++ } ++ ++ seq_printf(m, ++ "map_type:\t%u\n" ++ "key_size:\t%u\n" ++ "value_size:\t%u\n" ++ "max_entries:\t%u\n" ++ "map_flags:\t%#x\n" ++ "map_extra:\t%#llx\n" ++ "memlock:\t%lu\n" ++ "map_id:\t%u\n" ++ "frozen:\t%u\n", ++ map->map_type, ++ map->key_size, ++ map->value_size, ++ map->max_entries, ++ map->map_flags, ++ (unsigned long long)map->map_extra, ++ bpf_map_memory_footprint(map), ++ map->id, ++ READ_ONCE(map->frozen)); ++ if (type) { ++ seq_printf(m, "owner_prog_type:\t%u\n", type); ++ seq_printf(m, "owner_jited:\t%u\n", jited); ++ } ++} ++#endif ++ ++static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, ++ loff_t *ppos) ++{ ++ /* We need this handler such that alloc_file() enables ++ * f_mode with FMODE_CAN_READ. ++ */ ++ return -EINVAL; ++} ++ ++static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, ++ size_t siz, loff_t *ppos) ++{ ++ /* We need this handler such that alloc_file() enables ++ * f_mode with FMODE_CAN_WRITE. ++ */ ++ return -EINVAL; ++} ++ ++/* called for any extra memory-mapped regions (except initial) */ ++static void bpf_map_mmap_open(struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = vma->vm_file->private_data; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_inc(map); ++} ++ ++/* called for all unmapped memory region (including initial) */ ++static void bpf_map_mmap_close(struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = vma->vm_file->private_data; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_dec(map); ++} ++ ++static const struct vm_operations_struct bpf_map_default_vmops = { ++ .open = bpf_map_mmap_open, ++ .close = bpf_map_mmap_close, ++}; ++ ++static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ struct bpf_map *map = filp->private_data; ++ int err; ++ ++ if (!map->ops->map_mmap || map_value_has_spin_lock(map) || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) ++ return -ENOTSUPP; ++ ++ if (!(vma->vm_flags & VM_SHARED)) ++ return -EINVAL; ++ ++ mutex_lock(&map->freeze_mutex); ++ ++ if (vma->vm_flags & VM_WRITE) { ++ if (map->frozen) { ++ err = -EPERM; ++ goto out; ++ } ++ /* map is meant to be read-only, so do not allow mapping as ++ * writable, because it's possible to leak a writable page ++ * reference and allows user-space to still modify it after ++ * freezing, while verifier will assume contents do not change ++ */ ++ if (map->map_flags & BPF_F_RDONLY_PROG) { ++ err = -EACCES; ++ goto out; ++ } ++ } ++ ++ /* set default open/close callbacks */ ++ vma->vm_ops = &bpf_map_default_vmops; ++ vma->vm_private_data = map; ++ vma->vm_flags &= ~VM_MAYEXEC; ++ if (!(vma->vm_flags & VM_WRITE)) ++ /* disallow re-mapping with PROT_WRITE */ ++ vma->vm_flags &= ~VM_MAYWRITE; ++ ++ err = map->ops->map_mmap(map, vma); ++ if (err) ++ goto out; ++ ++ if (vma->vm_flags & VM_MAYWRITE) ++ bpf_map_write_active_inc(map); ++out: ++ mutex_unlock(&map->freeze_mutex); ++ return err; ++} ++ ++static __poll_t 
bpf_map_poll(struct file *filp, struct poll_table_struct *pts) ++{ ++ struct bpf_map *map = filp->private_data; ++ ++ if (map->ops->map_poll) ++ return map->ops->map_poll(map, filp, pts); ++ ++ return EPOLLERR; ++} ++ ++const struct file_operations bpf_map_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_map_show_fdinfo, ++#endif ++ .release = bpf_map_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++ .mmap = bpf_map_mmap, ++ .poll = bpf_map_poll, ++}; ++ ++int bpf_map_new_fd(struct bpf_map *map, int flags) ++{ ++ int ret; ++ ++ ret = security_bpf_map(map, OPEN_FMODE(flags)); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, ++ flags | O_CLOEXEC); ++} ++ ++int bpf_get_file_flag(int flags) ++{ ++ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) ++ return -EINVAL; ++ if (flags & BPF_F_RDONLY) ++ return O_RDONLY; ++ if (flags & BPF_F_WRONLY) ++ return O_WRONLY; ++ return O_RDWR; ++} ++ ++/* helper macro to check that unused fields 'union bpf_attr' are zero */ ++#define CHECK_ATTR(CMD) \ ++ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ ++ sizeof(attr->CMD##_LAST_FIELD), 0, \ ++ sizeof(*attr) - \ ++ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ ++ sizeof(attr->CMD##_LAST_FIELD)) != NULL ++ ++/* dst and src must have at least "size" number of bytes. ++ * Return strlen on success and < 0 on error. ++ */ ++int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) ++{ ++ const char *end = src + size; ++ const char *orig_src = src; ++ ++ memset(dst, 0, size); ++ /* Copy all isalnum(), '_' and '.' chars. */ ++ while (src < end && *src) { ++ if (!isalnum(*src) && ++ *src != '_' && *src != '.') ++ return -EINVAL; ++ *dst++ = *src++; ++ } ++ ++ /* No '\0' found in "size" number of bytes */ ++ if (src == end) ++ return -EINVAL; ++ ++ return src - orig_src; ++} ++ ++int map_check_no_btf(const struct bpf_map *map, ++ const struct btf *btf, ++ const struct btf_type *key_type, ++ const struct btf_type *value_type) ++{ ++ return -ENOTSUPP; ++} ++ ++static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) ++{ ++ const u32 a = *(const u32 *)_a; ++ const u32 b = *(const u32 *)_b; ++ ++ if (a < b) ++ return -1; ++ else if (a > b) ++ return 1; ++ return 0; ++} ++ ++static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) ++{ ++ struct bpf_map *map = (struct bpf_map *)priv; ++ u32 *off_base = map->off_arr->field_off; ++ u32 *a = _a, *b = _b; ++ u8 *sz_a, *sz_b; ++ ++ sz_a = map->off_arr->field_sz + (a - off_base); ++ sz_b = map->off_arr->field_sz + (b - off_base); ++ ++ swap(*a, *b); ++ swap(*sz_a, *sz_b); ++} ++ ++static int bpf_map_alloc_off_arr(struct bpf_map *map) ++{ ++ bool has_spin_lock = map_value_has_spin_lock(map); ++ bool has_timer = map_value_has_timer(map); ++ bool has_kptrs = map_value_has_kptrs(map); ++ struct bpf_map_off_arr *off_arr; ++ u32 i; ++ ++ if (!has_spin_lock && !has_timer && !has_kptrs) { ++ map->off_arr = NULL; ++ return 0; ++ } ++ ++ off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); ++ if (!off_arr) ++ return -ENOMEM; ++ map->off_arr = off_arr; ++ ++ off_arr->cnt = 0; ++ if (has_spin_lock) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->spin_lock_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); ++ off_arr->cnt++; ++ } ++ if (has_timer) { ++ i = off_arr->cnt; ++ ++ off_arr->field_off[i] = map->timer_off; ++ off_arr->field_sz[i] = sizeof(struct bpf_timer); ++ off_arr->cnt++; ++ } ++ if (has_kptrs) { ++ struct bpf_map_value_off 
*tab = map->kptr_off_tab; ++ u32 *off = &off_arr->field_off[off_arr->cnt]; ++ u8 *sz = &off_arr->field_sz[off_arr->cnt]; ++ ++ for (i = 0; i < tab->nr_off; i++) { ++ *off++ = tab->off[i].offset; ++ *sz++ = sizeof(u64); ++ } ++ off_arr->cnt += tab->nr_off; ++ } ++ ++ if (off_arr->cnt == 1) ++ return 0; ++ sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), ++ map_off_arr_cmp, map_off_arr_swap, map); ++ return 0; ++} ++ ++static int map_check_btf(struct bpf_map *map, const struct btf *btf, ++ u32 btf_key_id, u32 btf_value_id) ++{ ++ const struct btf_type *key_type, *value_type; ++ u32 key_size, value_size; ++ int ret = 0; ++ ++ /* Some maps allow key to be unspecified. */ ++ if (btf_key_id) { ++ key_type = btf_type_id_size(btf, &btf_key_id, &key_size); ++ if (!key_type || key_size != map->key_size) ++ return -EINVAL; ++ } else { ++ key_type = btf_type_by_id(btf, 0); ++ if (!map->ops->map_check_btf) ++ return -EINVAL; ++ } ++ ++ value_type = btf_type_id_size(btf, &btf_value_id, &value_size); ++ if (!value_type || value_size != map->value_size) ++ return -EINVAL; ++ ++ map->spin_lock_off = btf_find_spin_lock(btf, value_type); ++ ++ if (map_value_has_spin_lock(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY && ++ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && ++ map->map_type != BPF_MAP_TYPE_SK_STORAGE && ++ map->map_type != BPF_MAP_TYPE_INODE_STORAGE && ++ map->map_type != BPF_MAP_TYPE_TASK_STORAGE) ++ return -ENOTSUPP; ++ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > ++ map->value_size) { ++ WARN_ONCE(1, ++ "verifier bug spin_lock_off %d value_size %d\n", ++ map->spin_lock_off, map->value_size); ++ return -EFAULT; ++ } ++ } ++ ++ map->timer_off = btf_find_timer(btf, value_type); ++ if (map_value_has_timer(map)) { ++ if (map->map_flags & BPF_F_RDONLY_PROG) ++ return -EACCES; ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) ++ return -EOPNOTSUPP; ++ } ++ ++ map->kptr_off_tab = btf_parse_kptrs(btf, value_type); ++ if (map_value_has_kptrs(map)) { ++ if (!bpf_capable()) { ++ ret = -EPERM; ++ goto free_map_tab; ++ } ++ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ++ ret = -EACCES; ++ goto free_map_tab; ++ } ++ if (map->map_type != BPF_MAP_TYPE_HASH && ++ map->map_type != BPF_MAP_TYPE_LRU_HASH && ++ map->map_type != BPF_MAP_TYPE_ARRAY) { ++ ret = -EOPNOTSUPP; ++ goto free_map_tab; ++ } ++ } ++ ++ if (map->ops->map_check_btf) { ++ ret = map->ops->map_check_btf(map, btf, key_type, value_type); ++ if (ret < 0) ++ goto free_map_tab; ++ } ++ ++ return ret; ++free_map_tab: ++ bpf_map_free_kptr_off_tab(map); ++ return ret; ++} ++ ++#define BPF_MAP_CREATE_LAST_FIELD map_extra ++/* called via syscall */ ++static int map_create(union bpf_attr *attr) ++{ ++ int numa_node = bpf_map_attr_numa_node(attr); ++ struct bpf_map *map; ++ int f_flags; ++ int err; ++ ++ err = CHECK_ATTR(BPF_MAP_CREATE); ++ if (err) ++ return -EINVAL; ++ ++ if (attr->btf_vmlinux_value_type_id) { ++ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || ++ attr->btf_key_type_id || attr->btf_value_type_id) ++ return -EINVAL; ++ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { ++ return -EINVAL; ++ } ++ ++ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && ++ attr->map_extra != 0) ++ return -EINVAL; ++ ++ f_flags = bpf_get_file_flag(attr->map_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ if 
(numa_node != NUMA_NO_NODE && ++ ((unsigned int)numa_node >= nr_node_ids || ++ !node_online(numa_node))) ++ return -EINVAL; ++ ++ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ ++ map = find_and_alloc_map(attr); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ err = bpf_obj_name_cpy(map->name, attr->map_name, ++ sizeof(attr->map_name)); ++ if (err < 0) ++ goto free_map; ++ ++ atomic64_set(&map->refcnt, 1); ++ atomic64_set(&map->usercnt, 1); ++ mutex_init(&map->freeze_mutex); ++ spin_lock_init(&map->owner.lock); ++ ++ map->spin_lock_off = -EINVAL; ++ map->timer_off = -EINVAL; ++ if (attr->btf_key_type_id || attr->btf_value_type_id || ++ /* Even the map's value is a kernel's struct, ++ * the bpf_prog.o must have BTF to begin with ++ * to figure out the corresponding kernel's ++ * counter part. Thus, attr->btf_fd has ++ * to be valid also. ++ */ ++ attr->btf_vmlinux_value_type_id) { ++ struct btf *btf; ++ ++ btf = btf_get_by_fd(attr->btf_fd); ++ if (IS_ERR(btf)) { ++ err = PTR_ERR(btf); ++ goto free_map; ++ } ++ if (btf_is_kernel(btf)) { ++ btf_put(btf); ++ err = -EACCES; ++ goto free_map; ++ } ++ map->btf = btf; ++ ++ if (attr->btf_value_type_id) { ++ err = map_check_btf(map, btf, attr->btf_key_type_id, ++ attr->btf_value_type_id); ++ if (err) ++ goto free_map; ++ } ++ ++ map->btf_key_type_id = attr->btf_key_type_id; ++ map->btf_value_type_id = attr->btf_value_type_id; ++ map->btf_vmlinux_value_type_id = ++ attr->btf_vmlinux_value_type_id; ++ } ++ ++ err = bpf_map_alloc_off_arr(map); ++ if (err) ++ goto free_map; ++ ++ err = security_bpf_map_alloc(map); ++ if (err) ++ goto free_map_off_arr; ++ ++ err = bpf_map_alloc_id(map); ++ if (err) ++ goto free_map_sec; ++ ++ bpf_map_save_memcg(map); ++ ++ err = bpf_map_new_fd(map, f_flags); ++ if (err < 0) { ++ /* failed to allocate fd. ++ * bpf_map_put_with_uref() is needed because the above ++ * bpf_map_alloc_id() has published the map ++ * to the userspace and the userspace may ++ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. ++ */ ++ bpf_map_put_with_uref(map); ++ return err; ++ } ++ ++ return err; ++ ++free_map_sec: ++ security_bpf_map_free(map); ++free_map_off_arr: ++ kfree(map->off_arr); ++free_map: ++ btf_put(map->btf); ++ map->ops->map_free(map); ++ return err; ++} ++ ++/* if error is returned, fd is released. 
++ * On success caller should complete fd access with matching fdput() ++ */ ++struct bpf_map *__bpf_map_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_map_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_map_inc(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc); ++ ++void bpf_map_inc_with_uref(struct bpf_map *map) ++{ ++ atomic64_inc(&map->refcnt); ++ atomic64_inc(&map->usercnt); ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); ++ ++struct bpf_map *bpf_map_get(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc(map); ++ fdput(f); ++ ++ return map; ++} ++EXPORT_SYMBOL(bpf_map_get); ++ ++struct bpf_map *bpf_map_get_with_uref(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_map *map; ++ ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return map; ++ ++ bpf_map_inc_with_uref(map); ++ fdput(f); ++ ++ return map; ++} ++ ++/* map_idr_lock should have been held */ ++static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ if (uref) ++ atomic64_inc(&map->usercnt); ++ ++ return map; ++} ++ ++struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) ++{ ++ spin_lock_bh(&map_idr_lock); ++ map = __bpf_map_inc_not_zero(map, false); ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); ++ ++int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) ++{ ++ return -ENOTSUPP; ++} ++ ++static void *__bpf_copy_key(void __user *ukey, u64 key_size) ++{ ++ if (key_size) ++ return vmemdup_user(ukey, key_size); ++ ++ if (ukey) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) ++{ ++ if (key_size) ++ return kvmemdup_bpfptr(ukey, key_size); ++ ++ if (!bpfptr_is_null(ukey)) ++ return ERR_PTR(-EINVAL); ++ ++ return NULL; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags ++ ++static int map_lookup_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { ++ if (copy_from_user(value, uvalue, value_size)) ++ err = -EFAULT; ++ else ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ goto free_value; ++ } ++ ++ err = bpf_map_copy_value(map, key, value, attr->flags); ++ if (err) ++ goto free_value; ++ ++ 
err = -EFAULT; ++ if (copy_to_user(uvalue, value, value_size) != 0) ++ goto free_value; ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++ ++#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags ++ ++static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); ++ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = ___bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(value, uvalue, value_size) != 0) ++ goto free_value; ++ ++ err = bpf_map_update_value(map, f, key, value, attr->flags); ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_DELETE_ELEM_LAST_FIELD key ++ ++static int map_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ void *key; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ goto out; ++ } else if (IS_FD_PROG_ARRAY(map) || ++ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { ++ /* These maps require sleepable context */ ++ err = map->ops->map_delete_elem(map, key); ++ goto out; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ maybe_wait_bpf_programs(map); ++out: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key ++ ++static int map_get_next_key(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *unext_key = u64_to_user_ptr(attr->next_key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *next_key; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (ukey) { ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto 
err_put; ++ } ++ } else { ++ key = NULL; ++ } ++ ++ err = -ENOMEM; ++ next_key = kvmalloc(map->key_size, GFP_USER); ++ if (!next_key) ++ goto free_key; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_get_next_key(map, key, next_key); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, key, next_key); ++ rcu_read_unlock(); ++out: ++ if (err) ++ goto free_next_key; ++ ++ err = -EFAULT; ++ if (copy_to_user(unext_key, next_key, map->key_size) != 0) ++ goto free_next_key; ++ ++ err = 0; ++ ++free_next_key: ++ kvfree(next_key); ++free_key: ++ kvfree(key); ++err_put: ++ fdput(f); ++ return err; ++} ++ ++int generic_map_delete_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 cp, max_count; ++ int err = 0; ++ void *key; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size)) ++ break; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_delete_elem(map, key); ++ break; ++ } ++ ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ if (err) ++ break; ++ cond_resched(); ++ } ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(key); ++ ++ maybe_wait_bpf_programs(map); ++ return err; ++} ++ ++int generic_map_update_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *values = u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ u32 value_size, cp, max_count; ++ int ufd = attr->batch.map_fd; ++ void *key, *value; ++ struct fd f; ++ int err = 0; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ return -EINVAL; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!key) ++ return -ENOMEM; ++ ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) { ++ kvfree(key); ++ return -ENOMEM; ++ } ++ ++ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ ++ for (cp = 0; cp < max_count; cp++) { ++ err = -EFAULT; ++ if (copy_from_user(key, keys + cp * map->key_size, ++ map->key_size) || ++ copy_from_user(value, values + cp * value_size, value_size)) ++ break; ++ ++ err = bpf_map_update_value(map, f, key, value, ++ attr->batch.elem_flags); ++ ++ if (err) ++ break; ++ cond_resched(); ++ } ++ ++ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) ++ err = -EFAULT; ++ ++ kvfree(value); ++ kvfree(key); ++ fdput(f); ++ return err; ++} ++ ++#define MAP_LOOKUP_RETRIES 3 ++ ++int generic_map_lookup_batch(struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); ++ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); ++ void __user *values 
= u64_to_user_ptr(attr->batch.values); ++ void __user *keys = u64_to_user_ptr(attr->batch.keys); ++ void *buf, *buf_prevkey, *prev_key, *key, *value; ++ int err, retry = MAP_LOOKUP_RETRIES; ++ u32 value_size, cp, max_count; ++ ++ if (attr->batch.elem_flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ if ((attr->batch.elem_flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) ++ return -EINVAL; ++ ++ value_size = bpf_map_value_size(map); ++ ++ max_count = attr->batch.count; ++ if (!max_count) ++ return 0; ++ ++ if (put_user(0, &uattr->batch.count)) ++ return -EFAULT; ++ ++ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); ++ if (!buf_prevkey) ++ return -ENOMEM; ++ ++ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); ++ if (!buf) { ++ kvfree(buf_prevkey); ++ return -ENOMEM; ++ } ++ ++ err = -EFAULT; ++ prev_key = NULL; ++ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) ++ goto free_buf; ++ key = buf; ++ value = key + map->key_size; ++ if (ubatch) ++ prev_key = buf_prevkey; ++ ++ for (cp = 0; cp < max_count;) { ++ rcu_read_lock(); ++ err = map->ops->map_get_next_key(map, prev_key, key); ++ rcu_read_unlock(); ++ if (err) ++ break; ++ err = bpf_map_copy_value(map, key, value, ++ attr->batch.elem_flags); ++ ++ if (err == -ENOENT) { ++ if (retry) { ++ retry--; ++ continue; ++ } ++ err = -EINTR; ++ break; ++ } ++ ++ if (err) ++ goto free_buf; ++ ++ if (copy_to_user(keys + cp * map->key_size, key, ++ map->key_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ if (copy_to_user(values + cp * value_size, value, value_size)) { ++ err = -EFAULT; ++ goto free_buf; ++ } ++ ++ if (!prev_key) ++ prev_key = buf_prevkey; ++ ++ swap(prev_key, key); ++ retry = MAP_LOOKUP_RETRIES; ++ cp++; ++ cond_resched(); ++ } ++ ++ if (err == -EFAULT) ++ goto free_buf; ++ ++ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || ++ (cp && copy_to_user(uobatch, prev_key, map->key_size)))) ++ err = -EFAULT; ++ ++free_buf: ++ kvfree(buf_prevkey); ++ kvfree(buf); ++ return err; ++} ++ ++#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags ++ ++static int map_lookup_and_delete_elem(union bpf_attr *attr) ++{ ++ void __user *ukey = u64_to_user_ptr(attr->key); ++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; ++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) ++ return -EINVAL; ++ ++ if (attr->flags & ~BPF_F_LOCK) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ bpf_map_write_active_inc(map); ++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || ++ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (attr->flags && ++ (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ if ((attr->flags & BPF_F_LOCK) && ++ !map_value_has_spin_lock(map)) { ++ err = -EINVAL; ++ goto err_put; ++ } ++ ++ key = __bpf_copy_key(ukey, map->key_size); ++ if (IS_ERR(key)) { ++ err = PTR_ERR(key); ++ goto err_put; ++ } ++ ++ value_size = bpf_map_value_size(map); ++ ++ err = -ENOMEM; ++ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -ENOTSUPP; ++ if (map->map_type == BPF_MAP_TYPE_QUEUE || ++ map->map_type == BPF_MAP_TYPE_STACK) { ++ err = map->ops->map_pop_elem(map, value); ++ } else if (map->map_type == BPF_MAP_TYPE_HASH || ++ 
map->map_type == BPF_MAP_TYPE_PERCPU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_HASH || ++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { ++ if (!bpf_map_is_dev_bound(map)) { ++ bpf_disable_instrumentation(); ++ rcu_read_lock(); ++ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); ++ rcu_read_unlock(); ++ bpf_enable_instrumentation(); ++ } ++ } ++ ++ if (err) ++ goto free_value; ++ ++ if (copy_to_user(uvalue, value, value_size) != 0) { ++ err = -EFAULT; ++ goto free_value; ++ } ++ ++ err = 0; ++ ++free_value: ++ kvfree(value); ++free_key: ++ kvfree(key); ++err_put: ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define BPF_MAP_FREEZE_LAST_FIELD map_fd ++ ++static int map_freeze(const union bpf_attr *attr) ++{ ++ int err = 0, ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_FREEZE)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || ++ map_value_has_timer(map) || map_value_has_kptrs(map)) { ++ fdput(f); ++ return -ENOTSUPP; ++ } ++ ++ mutex_lock(&map->freeze_mutex); ++ if (bpf_map_write_active(map)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (READ_ONCE(map->frozen)) { ++ err = -EBUSY; ++ goto err_put; ++ } ++ if (!bpf_capable()) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ WRITE_ONCE(map->frozen, true); ++err_put: ++ mutex_unlock(&map->freeze_mutex); ++ fdput(f); ++ return err; ++} ++ ++static const struct bpf_prog_ops * const bpf_prog_types[] = { ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ ++ [_id] = & _name ## _prog_ops, ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) ++#include ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++}; ++ ++static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) ++{ ++ const struct bpf_prog_ops *ops; ++ ++ if (type >= ARRAY_SIZE(bpf_prog_types)) ++ return -EINVAL; ++ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ++ ops = bpf_prog_types[type]; ++ if (!ops) ++ return -EINVAL; ++ ++ if (!bpf_prog_is_dev_bound(prog->aux)) ++ prog->aux->ops = ops; ++ else ++ prog->aux->ops = &bpf_offload_prog_ops; ++ prog->type = type; ++ return 0; ++} ++ ++enum bpf_audit { ++ BPF_AUDIT_LOAD, ++ BPF_AUDIT_UNLOAD, ++ BPF_AUDIT_MAX, ++}; ++ ++static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { ++ [BPF_AUDIT_LOAD] = "LOAD", ++ [BPF_AUDIT_UNLOAD] = "UNLOAD", ++}; ++ ++static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) ++{ ++ struct audit_context *ctx = NULL; ++ struct audit_buffer *ab; ++ ++ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) ++ return; ++ if (audit_enabled == AUDIT_OFF) ++ return; ++ if (op == BPF_AUDIT_LOAD) ++ ctx = audit_context(); ++ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); ++ if (unlikely(!ab)) ++ return; ++ audit_log_format(ab, "prog-id=%u op=%s", ++ prog->aux->id, bpf_audit_str[op]); ++ audit_log_end(ab); ++} ++ ++static int bpf_prog_alloc_id(struct bpf_prog *prog) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&prog_idr_lock); ++ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); ++ if (id > 0) ++ prog->aux->id = id; ++ spin_unlock_bh(&prog_idr_lock); ++ idr_preload_end(); ++ ++ /* id is in [1, INT_MAX) */ ++ if (WARN_ON_ONCE(!id)) ++ return -ENOSPC; ++ ++ return id > 0 ? 
0 : id; ++} ++ ++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ unsigned long flags; ++ ++ /* cBPF to eBPF migrations are currently not in the idr store. ++ * Offloaded programs are removed from the store when their device ++ * disappears - even if someone grabs an fd to them they are unusable, ++ * simply waiting for refcnt to drop to be freed. ++ */ ++ if (!prog->aux->id) ++ return; ++ ++ if (do_idr_lock) ++ spin_lock_irqsave(&prog_idr_lock, flags); ++ else ++ __acquire(&prog_idr_lock); ++ ++ idr_remove(&prog_idr, prog->aux->id); ++ prog->aux->id = 0; ++ ++ if (do_idr_lock) ++ spin_unlock_irqrestore(&prog_idr_lock, flags); ++ else ++ __release(&prog_idr_lock); ++} ++ ++static void __bpf_prog_put_rcu(struct rcu_head *rcu) ++{ ++ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); ++ ++ kvfree(aux->func_info); ++ kfree(aux->func_info_aux); ++ free_uid(aux->user); ++ security_bpf_prog_free(aux); ++ bpf_prog_free(aux->prog); ++} ++ ++static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) ++{ ++ bpf_prog_kallsyms_del_all(prog); ++ btf_put(prog->aux->btf); ++ kvfree(prog->aux->jited_linfo); ++ kvfree(prog->aux->linfo); ++ kfree(prog->aux->kfunc_tab); ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ ++ if (deferred) { ++ if (prog->aux->sleepable) ++ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); ++ else ++ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); ++ } else { ++ __bpf_prog_put_rcu(&prog->aux->rcu); ++ } ++} ++ ++static void bpf_prog_put_deferred(struct work_struct *work) ++{ ++ struct bpf_prog_aux *aux; ++ struct bpf_prog *prog; ++ ++ aux = container_of(work, struct bpf_prog_aux, work); ++ prog = aux->prog; ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); ++ __bpf_prog_put_noref(prog, true); ++} ++ ++static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) ++{ ++ struct bpf_prog_aux *aux = prog->aux; ++ ++ if (atomic64_dec_and_test(&aux->refcnt)) { ++ /* bpf_prog_free_id() must be called first */ ++ bpf_prog_free_id(prog, do_idr_lock); ++ ++ if (in_irq() || irqs_disabled()) { ++ INIT_WORK(&aux->work, bpf_prog_put_deferred); ++ schedule_work(&aux->work); ++ } else { ++ bpf_prog_put_deferred(&aux->work); ++ } ++ } ++} ++ ++void bpf_prog_put(struct bpf_prog *prog) ++{ ++ __bpf_prog_put(prog, true); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_put); ++ ++static int bpf_prog_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_prog *prog = filp->private_data; ++ ++ bpf_prog_put(prog); ++ return 0; ++} ++ ++struct bpf_prog_kstats { ++ u64 nsecs; ++ u64 cnt; ++ u64 misses; ++}; ++ ++void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) ++{ ++ struct bpf_prog_stats *stats; ++ unsigned int flags; ++ ++ stats = this_cpu_ptr(prog->stats); ++ flags = u64_stats_update_begin_irqsave(&stats->syncp); ++ u64_stats_inc(&stats->misses); ++ u64_stats_update_end_irqrestore(&stats->syncp, flags); ++} ++ ++static void bpf_prog_get_stats(const struct bpf_prog *prog, ++ struct bpf_prog_kstats *stats) ++{ ++ u64 nsecs = 0, cnt = 0, misses = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ const struct bpf_prog_stats *st; ++ unsigned int start; ++ u64 tnsecs, tcnt, tmisses; ++ ++ st = per_cpu_ptr(prog->stats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&st->syncp); ++ tnsecs = u64_stats_read(&st->nsecs); ++ tcnt = u64_stats_read(&st->cnt); ++ tmisses = u64_stats_read(&st->misses); ++ } while (u64_stats_fetch_retry_irq(&st->syncp, start)); 
++ nsecs += tnsecs; ++ cnt += tcnt; ++ misses += tmisses; ++ } ++ stats->nsecs = nsecs; ++ stats->cnt = cnt; ++ stats->misses = misses; ++} ++ ++#ifdef CONFIG_PROC_FS ++static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_prog *prog = filp->private_data; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ struct bpf_prog_kstats stats; ++ ++ bpf_prog_get_stats(prog, &stats); ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "prog_type:\t%u\n" ++ "prog_jited:\t%u\n" ++ "prog_tag:\t%s\n" ++ "memlock:\t%llu\n" ++ "prog_id:\t%u\n" ++ "run_time_ns:\t%llu\n" ++ "run_cnt:\t%llu\n" ++ "recursion_misses:\t%llu\n" ++ "verified_insns:\t%u\n", ++ prog->type, ++ prog->jited, ++ prog_tag, ++ prog->pages * 1ULL << PAGE_SHIFT, ++ prog->aux->id, ++ stats.nsecs, ++ stats.cnt, ++ stats.misses, ++ prog->aux->verified_insns); ++} ++#endif ++ ++const struct file_operations bpf_prog_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_prog_show_fdinfo, ++#endif ++ .release = bpf_prog_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++int bpf_prog_new_fd(struct bpf_prog *prog) ++{ ++ int ret; ++ ++ ret = security_bpf_prog(prog); ++ if (ret < 0) ++ return ret; ++ ++ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, ++ O_RDWR | O_CLOEXEC); ++} ++ ++static struct bpf_prog *____bpf_prog_get(struct fd f) ++{ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_prog_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return f.file->private_data; ++} ++ ++void bpf_prog_add(struct bpf_prog *prog, int i) ++{ ++ atomic64_add(i, &prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_add); ++ ++void bpf_prog_sub(struct bpf_prog *prog, int i) ++{ ++ /* Only to be used for undoing previous bpf_prog_add() in some ++ * error path. We still know that another entity in our call ++ * path holds a reference to the program, thus atomic_sub() can ++ * be safely used in such cases! 
++ */ ++ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_sub); ++ ++void bpf_prog_inc(struct bpf_prog *prog) ++{ ++ atomic64_inc(&prog->aux->refcnt); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc); ++ ++/* prog_idr_lock should have been held */ ++struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) ++{ ++ int refold; ++ ++ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); ++ ++ if (!refold) ++ return ERR_PTR(-ENOENT); ++ ++ return prog; ++} ++EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); ++ ++bool bpf_prog_get_ok(struct bpf_prog *prog, ++ enum bpf_prog_type *attach_type, bool attach_drv) ++{ ++ /* not an attachment, just a refcount inc, always allow */ ++ if (!attach_type) ++ return true; ++ ++ if (prog->type != *attach_type) ++ return false; ++ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) ++ return false; ++ ++ return true; ++} ++ ++static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, ++ bool attach_drv) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_prog *prog; ++ ++ prog = ____bpf_prog_get(f); ++ if (IS_ERR(prog)) ++ return prog; ++ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { ++ prog = ERR_PTR(-EINVAL); ++ goto out; ++ } ++ ++ bpf_prog_inc(prog); ++out: ++ fdput(f); ++ return prog; ++} ++ ++struct bpf_prog *bpf_prog_get(u32 ufd) ++{ ++ return __bpf_prog_get(ufd, NULL, false); ++} ++ ++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, ++ bool attach_drv) ++{ ++ return __bpf_prog_get(ufd, &type, attach_drv); ++} ++EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); ++ ++/* Initially all BPF programs could be loaded w/o specifying ++ * expected_attach_type. Later for some of them specifying expected_attach_type ++ * at load time became required so that program could be validated properly. ++ * Programs of types that are allowed to be loaded both w/ and w/o (for ++ * backward compatibility) expected_attach_type, should have the default attach ++ * type assigned to expected_attach_type for the latter case, so that it can be ++ * validated later at attach time. ++ * ++ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if ++ * prog type requires it but has some attach types that have to be backward ++ * compatible. ++ */ ++static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) ++{ ++ switch (attr->prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't ++ * exist so checking for non-zero is the way to go here. 
++ */ ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_CGROUP_INET_SOCK_CREATE; ++ break; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ if (!attr->expected_attach_type) ++ attr->expected_attach_type = ++ BPF_SK_REUSEPORT_SELECT; ++ break; ++ } ++} ++ ++static int ++bpf_prog_load_check_attach(enum bpf_prog_type prog_type, ++ enum bpf_attach_type expected_attach_type, ++ struct btf *attach_btf, u32 btf_id, ++ struct bpf_prog *dst_prog) ++{ ++ if (btf_id) { ++ if (btf_id > BTF_MAX_TYPE) ++ return -EINVAL; ++ ++ if (!attach_btf && !dst_prog) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: ++ case BPF_PROG_TYPE_EXT: ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ if (attach_btf && (!btf_id || dst_prog)) ++ return -EINVAL; ++ ++ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && ++ prog_type != BPF_PROG_TYPE_EXT) ++ return -EINVAL; ++ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ switch (expected_attach_type) { ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_CGROUP_GETSOCKOPT: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ if (expected_attach_type == BPF_SK_LOOKUP) ++ return 0; ++ return -EINVAL; ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ switch (expected_attach_type) { ++ case BPF_SK_REUSEPORT_SELECT: ++ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ case BPF_PROG_TYPE_SYSCALL: ++ case BPF_PROG_TYPE_EXT: ++ if (expected_attach_type) ++ return -EINVAL; ++ fallthrough; ++ default: ++ return 0; ++ } ++} ++ ++static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_SCHED_CLS: ++ case BPF_PROG_TYPE_SCHED_ACT: ++ case BPF_PROG_TYPE_XDP: ++ case BPF_PROG_TYPE_LWT_IN: ++ case BPF_PROG_TYPE_LWT_OUT: ++ case BPF_PROG_TYPE_LWT_XMIT: ++ case BPF_PROG_TYPE_LWT_SEG6LOCAL: ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ /* always unpriv */ ++ case BPF_PROG_TYPE_SK_REUSEPORT: ++ /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ ++ default: ++ return false; ++ } ++} ++ ++static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) ++{ ++ switch (prog_type) { ++ case BPF_PROG_TYPE_KPROBE: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ ++ case BPF_PROG_TYPE_EXT: /* extends any prog */ ++ return true; ++ default: ++ return false; ++ } ++} ++ ++/* last field in 'union bpf_attr' used by this command */ ++#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size ++ ++static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type type = attr->prog_type; ++ struct bpf_prog *prog, *dst_prog = NULL; ++ struct btf *attach_btf = NULL; ++ int err; ++ char license[128]; ++ bool is_gpl; ++ ++ if (CHECK_ATTR(BPF_PROG_LOAD)) ++ return -EINVAL; ++ ++ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | ++ BPF_F_ANY_ALIGNMENT | ++ BPF_F_TEST_STATE_FREQ | ++ BPF_F_SLEEPABLE | ++ BPF_F_TEST_RND_HI32 | ++ BPF_F_XDP_HAS_FRAGS)) ++ return -EINVAL; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && ++ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && ++ !bpf_capable()) ++ return -EPERM; ++ ++ /* copy eBPF program license from user space */ ++ if (strncpy_from_bpfptr(license, ++ make_bpfptr(attr->license, uattr.is_kernel), ++ sizeof(license) - 1) < 0) ++ return -EFAULT; ++ license[sizeof(license) - 1] = 0; ++ ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ is_gpl = license_is_gpl_compatible(license); ++ ++ if (attr->insn_cnt == 0 || ++ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) ++ return -E2BIG; ++ if (type != BPF_PROG_TYPE_SOCKET_FILTER && ++ type != BPF_PROG_TYPE_CGROUP_SKB && ++ !bpf_capable()) ++ return -EPERM; ++ ++ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ if (is_perfmon_prog_type(type) && !perfmon_capable()) ++ return -EPERM; ++ ++ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog ++ * or btf, we need to check which one it is ++ */ ++ if (attr->attach_prog_fd) { ++ dst_prog = bpf_prog_get(attr->attach_prog_fd); ++ if (IS_ERR(dst_prog)) { ++ dst_prog = NULL; ++ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); ++ if (IS_ERR(attach_btf)) ++ return -EINVAL; ++ if (!btf_is_kernel(attach_btf)) { ++ /* attaching through specifying bpf_prog's BTF ++ * objects directly might be supported eventually ++ */ ++ btf_put(attach_btf); ++ return -ENOTSUPP; ++ } ++ } ++ } else if (attr->attach_btf_id) { ++ /* fall back to vmlinux BTF, if BTF type ID is specified */ ++ attach_btf = bpf_get_btf_vmlinux(); ++ if (IS_ERR(attach_btf)) ++ return PTR_ERR(attach_btf); ++ if (!attach_btf) ++ return -EINVAL; ++ btf_get(attach_btf); ++ } ++ ++ bpf_prog_load_fixup_attach_type(attr); ++ if (bpf_prog_load_check_attach(type, attr->expected_attach_type, ++ attach_btf, attr->attach_btf_id, ++ dst_prog)) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -EINVAL; ++ } ++ ++ /* plain bpf_prog allocation */ ++ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); ++ if (!prog) { ++ if (dst_prog) ++ bpf_prog_put(dst_prog); ++ if (attach_btf) ++ btf_put(attach_btf); ++ return -ENOMEM; ++ } ++ ++ prog->expected_attach_type = attr->expected_attach_type; ++ prog->aux->attach_btf = attach_btf; ++ 
prog->aux->attach_btf_id = attr->attach_btf_id; ++ prog->aux->dst_prog = dst_prog; ++ prog->aux->offload_requested = !!attr->prog_ifindex; ++ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; ++ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; ++ ++ err = security_bpf_prog_alloc(prog->aux); ++ if (err) ++ goto free_prog; ++ ++ prog->aux->user = get_current_user(); ++ prog->len = attr->insn_cnt; ++ ++ err = -EFAULT; ++ if (copy_from_bpfptr(prog->insns, ++ make_bpfptr(attr->insns, uattr.is_kernel), ++ bpf_prog_insn_size(prog)) != 0) ++ goto free_prog_sec; ++ ++ prog->orig_prog = NULL; ++ prog->jited = 0; ++ ++ atomic64_set(&prog->aux->refcnt, 1); ++ prog->gpl_compatible = is_gpl ? 1 : 0; ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_init(prog, attr); ++ if (err) ++ goto free_prog_sec; ++ } ++ ++ /* find program type: socket_filter vs tracing_filter */ ++ err = find_prog_type(type, prog); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ prog->aux->load_time = ktime_get_boottime_ns(); ++ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, ++ sizeof(attr->prog_name)); ++ if (err < 0) ++ goto free_prog_sec; ++ ++ /* run eBPF verifier */ ++ err = bpf_check(&prog, attr, uattr); ++ if (err < 0) ++ goto free_used_maps; ++ ++ prog = bpf_prog_select_runtime(prog, &err); ++ if (err < 0) ++ goto free_used_maps; ++ ++ err = bpf_prog_alloc_id(prog); ++ if (err) ++ goto free_used_maps; ++ ++ /* Upon success of bpf_prog_alloc_id(), the BPF prog is ++ * effectively publicly exposed. However, retrieving via ++ * bpf_prog_get_fd_by_id() will take another reference, ++ * therefore it cannot be gone underneath us. ++ * ++ * Only for the time /after/ successful bpf_prog_new_fd() ++ * and before returning to userspace, we might just hold ++ * one reference and any parallel close on that fd could ++ * rip everything out. Hence, below notifications must ++ * happen before bpf_prog_new_fd(). ++ * ++ * Also, any failure handling from this point onwards must ++ * be using bpf_prog_put() given the program is exposed. ++ */ ++ bpf_prog_kallsyms_add(prog); ++ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); ++ bpf_audit_prog(prog, BPF_AUDIT_LOAD); ++ ++ err = bpf_prog_new_fd(prog); ++ if (err < 0) ++ bpf_prog_put(prog); ++ return err; ++ ++free_used_maps: ++ /* In case we have subprogs, we need to wait for a grace ++ * period before we can tear down JIT memory since symbols ++ * are already exposed under kallsyms. 
++ */ ++ __bpf_prog_put_noref(prog, prog->aux->func_cnt); ++ return err; ++free_prog_sec: ++ free_uid(prog->aux->user); ++ security_bpf_prog_free(prog->aux); ++free_prog: ++ if (prog->aux->attach_btf) ++ btf_put(prog->aux->attach_btf); ++ bpf_prog_free(prog); ++ return err; ++} ++ ++#define BPF_OBJ_LAST_FIELD file_flags ++ ++static int bpf_obj_pin(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) ++ return -EINVAL; ++ ++ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); ++} ++ ++static int bpf_obj_get(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || ++ attr->file_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), ++ attr->file_flags); ++} ++ ++void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, ++ const struct bpf_link_ops *ops, struct bpf_prog *prog) ++{ ++ atomic64_set(&link->refcnt, 1); ++ link->type = type; ++ link->id = 0; ++ link->ops = ops; ++ link->prog = prog; ++} ++ ++static void bpf_link_free_id(int id) ++{ ++ if (!id) ++ return; ++ ++ spin_lock_bh(&link_idr_lock); ++ idr_remove(&link_idr, id); ++ spin_unlock_bh(&link_idr_lock); ++} ++ ++/* Clean up bpf_link and corresponding anon_inode file and FD. After ++ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred ++ * anon_inode's release() call. This helper marksbpf_link as ++ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt ++ * is not decremented, it's the responsibility of a calling code that failed ++ * to complete bpf_link initialization. ++ */ ++void bpf_link_cleanup(struct bpf_link_primer *primer) ++{ ++ primer->link->prog = NULL; ++ bpf_link_free_id(primer->id); ++ fput(primer->file); ++ put_unused_fd(primer->fd); ++} ++ ++void bpf_link_inc(struct bpf_link *link) ++{ ++ atomic64_inc(&link->refcnt); ++} ++ ++/* bpf_link_free is guaranteed to be called from process context */ ++static void bpf_link_free(struct bpf_link *link) ++{ ++ bpf_link_free_id(link->id); ++ if (link->prog) { ++ /* detach BPF program, clean up used resources */ ++ link->ops->release(link); ++ bpf_prog_put(link->prog); ++ } ++ /* free bpf_link and its containing memory */ ++ link->ops->dealloc(link); ++} ++ ++static void bpf_link_put_deferred(struct work_struct *work) ++{ ++ struct bpf_link *link = container_of(work, struct bpf_link, work); ++ ++ bpf_link_free(link); ++} ++ ++/* bpf_link_put can be called from atomic context, but ensures that resources ++ * are freed from process context ++ */ ++void bpf_link_put(struct bpf_link *link) ++{ ++ if (!atomic64_dec_and_test(&link->refcnt)) ++ return; ++ ++ if (in_atomic()) { ++ INIT_WORK(&link->work, bpf_link_put_deferred); ++ schedule_work(&link->work); ++ } else { ++ bpf_link_free(link); ++ } ++} ++EXPORT_SYMBOL(bpf_link_put); ++ ++static int bpf_link_release(struct inode *inode, struct file *filp) ++{ ++ struct bpf_link *link = filp->private_data; ++ ++ bpf_link_put(link); ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) ++#define BPF_MAP_TYPE(_id, _ops) ++#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, ++static const char *bpf_link_type_strs[] = { ++ [BPF_LINK_TYPE_UNSPEC] = "", ++#include ++}; ++#undef BPF_PROG_TYPE ++#undef BPF_MAP_TYPE ++#undef BPF_LINK_TYPE ++ ++static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) ++{ ++ const struct bpf_link *link = filp->private_data; ++ const struct bpf_prog *prog = 
link->prog; ++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; ++ ++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); ++ seq_printf(m, ++ "link_type:\t%s\n" ++ "link_id:\t%u\n" ++ "prog_tag:\t%s\n" ++ "prog_id:\t%u\n", ++ bpf_link_type_strs[link->type], ++ link->id, ++ prog_tag, ++ prog->aux->id); ++ if (link->ops->show_fdinfo) ++ link->ops->show_fdinfo(link, m); ++} ++#endif ++ ++static const struct file_operations bpf_link_fops = { ++#ifdef CONFIG_PROC_FS ++ .show_fdinfo = bpf_link_show_fdinfo, ++#endif ++ .release = bpf_link_release, ++ .read = bpf_dummy_read, ++ .write = bpf_dummy_write, ++}; ++ ++static int bpf_link_alloc_id(struct bpf_link *link) ++{ ++ int id; ++ ++ idr_preload(GFP_KERNEL); ++ spin_lock_bh(&link_idr_lock); ++ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); ++ spin_unlock_bh(&link_idr_lock); ++ idr_preload_end(); ++ ++ return id; ++} ++ ++/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, ++ * reserving unused FD and allocating ID from link_idr. This is to be paired ++ * with bpf_link_settle() to install FD and ID and expose bpf_link to ++ * user-space, if bpf_link is successfully attached. If not, bpf_link and ++ * pre-allocated resources are to be freed with bpf_cleanup() call. All the ++ * transient state is passed around in struct bpf_link_primer. ++ * This is preferred way to create and initialize bpf_link, especially when ++ * there are complicated and expensive operations in between creating bpf_link ++ * itself and attaching it to BPF hook. By using bpf_link_prime() and ++ * bpf_link_settle() kernel code using bpf_link doesn't have to perform ++ * expensive (and potentially failing) roll back operations in a rare case ++ * that file, FD, or ID can't be allocated. ++ */ ++int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) ++{ ++ struct file *file; ++ int fd, id; ++ ++ fd = get_unused_fd_flags(O_CLOEXEC); ++ if (fd < 0) ++ return fd; ++ ++ ++ id = bpf_link_alloc_id(link); ++ if (id < 0) { ++ put_unused_fd(fd); ++ return id; ++ } ++ ++ file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); ++ if (IS_ERR(file)) { ++ bpf_link_free_id(id); ++ put_unused_fd(fd); ++ return PTR_ERR(file); ++ } ++ ++ primer->link = link; ++ primer->file = file; ++ primer->fd = fd; ++ primer->id = id; ++ return 0; ++} ++ ++int bpf_link_settle(struct bpf_link_primer *primer) ++{ ++ /* make bpf_link fetchable by ID */ ++ spin_lock_bh(&link_idr_lock); ++ primer->link->id = primer->id; ++ spin_unlock_bh(&link_idr_lock); ++ /* make bpf_link fetchable by FD */ ++ fd_install(primer->fd, primer->file); ++ /* pass through installed FD */ ++ return primer->fd; ++} ++ ++int bpf_link_new_fd(struct bpf_link *link) ++{ ++ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); ++} ++ ++struct bpf_link *bpf_link_get_from_fd(u32 ufd) ++{ ++ struct fd f = fdget(ufd); ++ struct bpf_link *link; ++ ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++ if (f.file->f_op != &bpf_link_fops) { ++ fdput(f); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ link = f.file->private_data; ++ bpf_link_inc(link); ++ fdput(f); ++ ++ return link; ++} ++EXPORT_SYMBOL(bpf_link_get_from_fd); ++ ++static void bpf_tracing_link_release(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, ++ tr_link->trampoline)); ++ ++ bpf_trampoline_put(tr_link->trampoline); ++ ++ /* tgt_prog is NULL if target is a kernel function */ 
++ if (tr_link->tgt_prog) ++ bpf_prog_put(tr_link->tgt_prog); ++} ++ ++static void bpf_tracing_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ kfree(tr_link); ++} ++ ++static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ seq_printf(seq, ++ "attach_type:\t%d\n", ++ tr_link->attach_type); ++} ++ ++static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_tracing_link *tr_link = ++ container_of(link, struct bpf_tracing_link, link.link); ++ ++ info->tracing.attach_type = tr_link->attach_type; ++ bpf_trampoline_unpack_key(tr_link->trampoline->key, ++ &info->tracing.target_obj_id, ++ &info->tracing.target_btf_id); ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_tracing_link_lops = { ++ .release = bpf_tracing_link_release, ++ .dealloc = bpf_tracing_link_dealloc, ++ .show_fdinfo = bpf_tracing_link_show_fdinfo, ++ .fill_link_info = bpf_tracing_link_fill_link_info, ++}; ++ ++static int bpf_tracing_prog_attach(struct bpf_prog *prog, ++ int tgt_prog_fd, ++ u32 btf_id, ++ u64 bpf_cookie) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_prog *tgt_prog = NULL; ++ struct bpf_trampoline *tr = NULL; ++ struct bpf_tracing_link *link; ++ u64 key = 0; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ if (prog->expected_attach_type != BPF_TRACE_FENTRY && ++ prog->expected_attach_type != BPF_TRACE_FEXIT && ++ prog->expected_attach_type != BPF_MODIFY_RETURN) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_EXT: ++ if (prog->expected_attach_type != 0) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ case BPF_PROG_TYPE_LSM: ++ if (prog->expected_attach_type != BPF_LSM_MAC) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ break; ++ default: ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (!!tgt_prog_fd != !!btf_id) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ if (tgt_prog_fd) { ++ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ ++ if (prog->type != BPF_PROG_TYPE_EXT) { ++ err = -EINVAL; ++ goto out_put_prog; ++ } ++ ++ tgt_prog = bpf_prog_get(tgt_prog_fd); ++ if (IS_ERR(tgt_prog)) { ++ err = PTR_ERR(tgt_prog); ++ tgt_prog = NULL; ++ goto out_put_prog; ++ } ++ ++ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_prog; ++ } ++ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, ++ &bpf_tracing_link_lops, prog); ++ link->attach_type = prog->expected_attach_type; ++ link->link.cookie = bpf_cookie; ++ ++ mutex_lock(&prog->aux->dst_mutex); ++ ++ /* There are a few possible cases here: ++ * ++ * - if prog->aux->dst_trampoline is set, the program was just loaded ++ * and not yet attached to anything, so we can use the values stored ++ * in prog->aux ++ * ++ * - if prog->aux->dst_trampoline is NULL, the program has already been ++ * attached to a target and its initial target was cleared (below) ++ * ++ * - if tgt_prog != NULL, the caller specified tgt_prog_fd + ++ * target_btf_id using the link_create API. 
++ * ++ * - if tgt_prog == NULL when this function was called using the old ++ * raw_tracepoint_open API, and we need a target from prog->aux ++ * ++ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program ++ * was detached and is going for re-attachment. ++ */ ++ if (!prog->aux->dst_trampoline && !tgt_prog) { ++ /* ++ * Allow re-attach for TRACING and LSM programs. If it's ++ * currently linked, bpf_trampoline_link_prog will fail. ++ * EXT programs need to specify tgt_prog_fd, so they ++ * re-attach in separate code path. ++ */ ++ if (prog->type != BPF_PROG_TYPE_TRACING && ++ prog->type != BPF_PROG_TYPE_LSM) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ btf_id = prog->aux->attach_btf_id; ++ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); ++ } ++ ++ if (!prog->aux->dst_trampoline || ++ (key && key != prog->aux->dst_trampoline->key)) { ++ /* If there is no saved target, or the specified target is ++ * different from the destination specified at load time, we ++ * need a new trampoline and a check for compatibility ++ */ ++ struct bpf_attach_target_info tgt_info = {}; ++ ++ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, ++ &tgt_info); ++ if (err) ++ goto out_unlock; ++ ++ tr = bpf_trampoline_get(key, &tgt_info); ++ if (!tr) { ++ err = -ENOMEM; ++ goto out_unlock; ++ } ++ } else { ++ /* The caller didn't specify a target, or the target was the ++ * same as the destination supplied during program load. This ++ * means we can reuse the trampoline and reference from program ++ * load time, and there is no need to allocate a new one. This ++ * can only happen once for any program, as the saved values in ++ * prog->aux are cleared below. ++ */ ++ tr = prog->aux->dst_trampoline; ++ tgt_prog = prog->aux->dst_prog; ++ } ++ ++ err = bpf_link_prime(&link->link.link, &link_primer); ++ if (err) ++ goto out_unlock; ++ ++ err = bpf_trampoline_link_prog(&link->link, tr); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ link = NULL; ++ goto out_unlock; ++ } ++ ++ link->tgt_prog = tgt_prog; ++ link->trampoline = tr; ++ ++ /* Always clear the trampoline and target prog from prog->aux to make ++ * sure the original attach destination is not kept alive after a ++ * program is (re-)attached to another target. 
++ */ ++ if (prog->aux->dst_prog && ++ (tgt_prog_fd || tr != prog->aux->dst_trampoline)) ++ /* got extra prog ref from syscall, or attaching to different prog */ ++ bpf_prog_put(prog->aux->dst_prog); ++ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) ++ /* we allocated a new trampoline, so free the old one */ ++ bpf_trampoline_put(prog->aux->dst_trampoline); ++ ++ prog->aux->dst_prog = NULL; ++ prog->aux->dst_trampoline = NULL; ++ mutex_unlock(&prog->aux->dst_mutex); ++ ++ return bpf_link_settle(&link_primer); ++out_unlock: ++ if (tr && tr != prog->aux->dst_trampoline) ++ bpf_trampoline_put(tr); ++ mutex_unlock(&prog->aux->dst_mutex); ++ kfree(link); ++out_put_prog: ++ if (tgt_prog_fd && tgt_prog) ++ bpf_prog_put(tgt_prog); ++ return err; ++} ++ ++struct bpf_raw_tp_link { ++ struct bpf_link link; ++ struct bpf_raw_event_map *btp; ++}; ++ ++static void bpf_raw_tp_link_release(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); ++ bpf_put_raw_tracepoint(raw_tp->btp); ++} ++ ++static void bpf_raw_tp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ kfree(raw_tp); ++} ++ ++static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ ++ seq_printf(seq, ++ "tp_name:\t%s\n", ++ raw_tp_link->btp->tp->name); ++} ++ ++static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_raw_tp_link *raw_tp_link = ++ container_of(link, struct bpf_raw_tp_link, link); ++ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); ++ const char *tp_name = raw_tp_link->btp->tp->name; ++ u32 ulen = info->raw_tracepoint.tp_name_len; ++ size_t tp_len = strlen(tp_name); ++ ++ if (!ulen ^ !ubuf) ++ return -EINVAL; ++ ++ info->raw_tracepoint.tp_name_len = tp_len + 1; ++ ++ if (!ubuf) ++ return 0; ++ ++ if (ulen >= tp_len + 1) { ++ if (copy_to_user(ubuf, tp_name, tp_len + 1)) ++ return -EFAULT; ++ } else { ++ char zero = '\0'; ++ ++ if (copy_to_user(ubuf, tp_name, ulen - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + ulen - 1)) ++ return -EFAULT; ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++static const struct bpf_link_ops bpf_raw_tp_link_lops = { ++ .release = bpf_raw_tp_link_release, ++ .dealloc = bpf_raw_tp_link_dealloc, ++ .show_fdinfo = bpf_raw_tp_link_show_fdinfo, ++ .fill_link_info = bpf_raw_tp_link_fill_link_info, ++}; ++ ++#ifdef CONFIG_PERF_EVENTS ++struct bpf_perf_link { ++ struct bpf_link link; ++ struct file *perf_file; ++}; ++ ++static void bpf_perf_link_release(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ struct perf_event *event = perf_link->perf_file->private_data; ++ ++ perf_event_free_bpf_prog(event); ++ fput(perf_link->perf_file); ++} ++ ++static void bpf_perf_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); ++ ++ kfree(perf_link); ++} ++ ++static const struct bpf_link_ops bpf_perf_link_lops = { ++ .release = bpf_perf_link_release, ++ .dealloc = bpf_perf_link_dealloc, ++}; ++ ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_perf_link 
*link; ++ struct perf_event *event; ++ struct file *perf_file; ++ int err; ++ ++ if (attr->link_create.flags) ++ return -EINVAL; ++ ++ perf_file = perf_event_get(attr->link_create.target_fd); ++ if (IS_ERR(perf_file)) ++ return PTR_ERR(perf_file); ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_file; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); ++ link->perf_file = perf_file; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_file; ++ } ++ ++ event = perf_file->private_data; ++ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_file; ++ } ++ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ ++ bpf_prog_inc(prog); ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_file: ++ fput(perf_file); ++ return err; ++} ++#else ++static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* CONFIG_PERF_EVENTS */ ++ ++static int bpf_raw_tp_link_attach(struct bpf_prog *prog, ++ const char __user *user_tp_name) ++{ ++ struct bpf_link_primer link_primer; ++ struct bpf_raw_tp_link *link; ++ struct bpf_raw_event_map *btp; ++ const char *tp_name; ++ char buf[128]; ++ int err; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_TRACING: ++ case BPF_PROG_TYPE_EXT: ++ case BPF_PROG_TYPE_LSM: ++ if (user_tp_name) ++ /* The attach point for this category of programs ++ * should be specified via btf_id during program load. ++ */ ++ return -EINVAL; ++ if (prog->type == BPF_PROG_TYPE_TRACING && ++ prog->expected_attach_type == BPF_TRACE_RAW_TP) { ++ tp_name = prog->aux->attach_func_name; ++ break; ++ } ++ return bpf_tracing_prog_attach(prog, 0, 0, 0); ++ case BPF_PROG_TYPE_RAW_TRACEPOINT: ++ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: ++ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) ++ return -EFAULT; ++ buf[sizeof(buf) - 1] = 0; ++ tp_name = buf; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ btp = bpf_get_raw_tracepoint(tp_name); ++ if (!btp) ++ return -ENOENT; ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto out_put_btp; ++ } ++ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, ++ &bpf_raw_tp_link_lops, prog); ++ link->btp = btp; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto out_put_btp; ++ } ++ ++ err = bpf_probe_register(link->btp, prog); ++ if (err) { ++ bpf_link_cleanup(&link_primer); ++ goto out_put_btp; ++ } ++ ++ return bpf_link_settle(&link_primer); ++ ++out_put_btp: ++ bpf_put_raw_tracepoint(btp); ++ return err; ++} ++ ++#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd ++ ++static int bpf_raw_tracepoint_open(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ return fd; ++} ++ ++static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ++ enum bpf_attach_type attach_type) ++{ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case 
BPF_PROG_TYPE_SK_LOOKUP: ++ return attach_type == prog->expected_attach_type ? 0 : -EINVAL; ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ if (!capable(CAP_NET_ADMIN)) ++ /* cg-skb progs can be loaded by unpriv user. ++ * check permissions at attach time. ++ */ ++ return -EPERM; ++ return prog->enforce_expected_attach_type && ++ prog->expected_attach_type != attach_type ? ++ -EINVAL : 0; ++ default: ++ return 0; ++ } ++} ++ ++static enum bpf_prog_type ++attach_type_to_prog_type(enum bpf_attach_type attach_type) ++{ ++ switch (attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ return BPF_PROG_TYPE_CGROUP_SKB; ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ return BPF_PROG_TYPE_CGROUP_SOCK; ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; ++ case BPF_CGROUP_SOCK_OPS: ++ return BPF_PROG_TYPE_SOCK_OPS; ++ case BPF_CGROUP_DEVICE: ++ return BPF_PROG_TYPE_CGROUP_DEVICE; ++ case BPF_SK_MSG_VERDICT: ++ return BPF_PROG_TYPE_SK_MSG; ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return BPF_PROG_TYPE_SK_SKB; ++ case BPF_LIRC_MODE2: ++ return BPF_PROG_TYPE_LIRC_MODE2; ++ case BPF_FLOW_DISSECTOR: ++ return BPF_PROG_TYPE_FLOW_DISSECTOR; ++ case BPF_CGROUP_SYSCTL: ++ return BPF_PROG_TYPE_CGROUP_SYSCTL; ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ return BPF_PROG_TYPE_CGROUP_SOCKOPT; ++ case BPF_TRACE_ITER: ++ case BPF_TRACE_RAW_TP: ++ case BPF_TRACE_FENTRY: ++ case BPF_TRACE_FEXIT: ++ case BPF_MODIFY_RETURN: ++ return BPF_PROG_TYPE_TRACING; ++ case BPF_LSM_MAC: ++ return BPF_PROG_TYPE_LSM; ++ case BPF_SK_LOOKUP: ++ return BPF_PROG_TYPE_SK_LOOKUP; ++ case BPF_XDP: ++ return BPF_PROG_TYPE_XDP; ++ case BPF_LSM_CGROUP: ++ return BPF_PROG_TYPE_LSM; ++ default: ++ return BPF_PROG_TYPE_UNSPEC; ++ } ++} ++ ++#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd ++ ++#define BPF_F_ATTACH_MASK \ ++ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) ++ ++static int bpf_prog_attach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_PROG_ATTACH)) ++ return -EINVAL; ++ ++ if (attr->attach_flags & ~BPF_F_ATTACH_MASK) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_SKB: ++ case BPF_PROG_TYPE_SK_MSG: ++ ret = sock_map_get_from_fd(attr, prog); ++ break; ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ ret = lirc_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ ret = netns_bpf_prog_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case 
BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ if (ptype == BPF_PROG_TYPE_LSM && ++ prog->expected_attach_type != BPF_LSM_CGROUP) ++ return -EINVAL; ++ ++ ret = cgroup_bpf_prog_attach(attr, ptype, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ if (ret) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_PROG_DETACH_LAST_FIELD attach_type ++ ++static int bpf_prog_detach(const union bpf_attr *attr) ++{ ++ enum bpf_prog_type ptype; ++ ++ if (CHECK_ATTR(BPF_PROG_DETACH)) ++ return -EINVAL; ++ ++ ptype = attach_type_to_prog_type(attr->attach_type); ++ ++ switch (ptype) { ++ case BPF_PROG_TYPE_SK_MSG: ++ case BPF_PROG_TYPE_SK_SKB: ++ return sock_map_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_LIRC_MODE2: ++ return lirc_prog_detach(attr); ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ return netns_bpf_prog_detach(attr, ptype); ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_LSM: ++ return cgroup_bpf_prog_detach(attr, ptype); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags ++ ++static int bpf_prog_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ if (CHECK_ATTR(BPF_PROG_QUERY)) ++ return -EINVAL; ++ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) ++ return -EINVAL; ++ ++ switch (attr->query.attach_type) { ++ case BPF_CGROUP_INET_INGRESS: ++ case BPF_CGROUP_INET_EGRESS: ++ case BPF_CGROUP_INET_SOCK_CREATE: ++ case BPF_CGROUP_INET_SOCK_RELEASE: ++ case BPF_CGROUP_INET4_BIND: ++ case BPF_CGROUP_INET6_BIND: ++ case BPF_CGROUP_INET4_POST_BIND: ++ case BPF_CGROUP_INET6_POST_BIND: ++ case BPF_CGROUP_INET4_CONNECT: ++ case BPF_CGROUP_INET6_CONNECT: ++ case BPF_CGROUP_INET4_GETPEERNAME: ++ case BPF_CGROUP_INET6_GETPEERNAME: ++ case BPF_CGROUP_INET4_GETSOCKNAME: ++ case BPF_CGROUP_INET6_GETSOCKNAME: ++ case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: ++ case BPF_CGROUP_SOCK_OPS: ++ case BPF_CGROUP_DEVICE: ++ case BPF_CGROUP_SYSCTL: ++ case BPF_CGROUP_GETSOCKOPT: ++ case BPF_CGROUP_SETSOCKOPT: ++ case BPF_LSM_CGROUP: ++ return cgroup_bpf_prog_query(attr, uattr); ++ case BPF_LIRC_MODE2: ++ return lirc_prog_query(attr, uattr); ++ case BPF_FLOW_DISSECTOR: ++ case BPF_SK_LOOKUP: ++ return netns_bpf_prog_query(attr, uattr); ++ case BPF_SK_SKB_STREAM_PARSER: ++ case BPF_SK_SKB_STREAM_VERDICT: ++ case BPF_SK_MSG_VERDICT: ++ case BPF_SK_SKB_VERDICT: ++ return sock_map_bpf_prog_query(attr, uattr); ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size ++ ++static int bpf_prog_test_run(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog *prog; ++ int ret = -ENOTSUPP; ++ ++ if (CHECK_ATTR(BPF_PROG_TEST_RUN)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_in && !attr->test.ctx_in) || ++ (!attr->test.ctx_size_in && attr->test.ctx_in)) ++ return -EINVAL; ++ ++ if ((attr->test.ctx_size_out && !attr->test.ctx_out) || ++ (!attr->test.ctx_size_out && attr->test.ctx_out)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->test.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (prog->aux->ops->test_run) ++ 
ret = prog->aux->ops->test_run(prog, attr, uattr); ++ ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id ++ ++static int bpf_obj_get_next_id(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ struct idr *idr, ++ spinlock_t *lock) ++{ ++ u32 next_id = attr->start_id; ++ int err = 0; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ next_id++; ++ spin_lock_bh(lock); ++ if (!idr_get_next(idr, &next_id)) ++ err = -ENOENT; ++ spin_unlock_bh(lock); ++ ++ if (!err) ++ err = put_user(next_id, &uattr->next_id); ++ ++ return err; ++} ++ ++struct bpf_map *bpf_map_get_curr_or_next(u32 *id) ++{ ++ struct bpf_map *map; ++ ++ spin_lock_bh(&map_idr_lock); ++again: ++ map = idr_get_next(&map_idr, id); ++ if (map) { ++ map = __bpf_map_inc_not_zero(map, false); ++ if (IS_ERR(map)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&map_idr_lock); ++ ++ return map; ++} ++ ++struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) ++{ ++ struct bpf_prog *prog; ++ ++ spin_lock_bh(&prog_idr_lock); ++again: ++ prog = idr_get_next(&prog_idr, id); ++ if (prog) { ++ prog = bpf_prog_inc_not_zero(prog); ++ if (IS_ERR(prog)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&prog_idr_lock); ++ ++ return prog; ++} ++ ++#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id ++ ++struct bpf_prog *bpf_prog_by_id(u32 id) ++{ ++ struct bpf_prog *prog; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&prog_idr_lock); ++ prog = idr_find(&prog_idr, id); ++ if (prog) ++ prog = bpf_prog_inc_not_zero(prog); ++ else ++ prog = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&prog_idr_lock); ++ return prog; ++} ++ ++static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_prog *prog; ++ u32 id = attr->prog_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ prog = bpf_prog_by_id(id); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ fd = bpf_prog_new_fd(prog); ++ if (fd < 0) ++ bpf_prog_put(prog); ++ ++ return fd; ++} ++ ++#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags ++ ++static int bpf_map_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_map *map; ++ u32 id = attr->map_id; ++ int f_flags; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || ++ attr->open_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ f_flags = bpf_get_file_flag(attr->open_flags); ++ if (f_flags < 0) ++ return f_flags; ++ ++ spin_lock_bh(&map_idr_lock); ++ map = idr_find(&map_idr, id); ++ if (map) ++ map = __bpf_map_inc_not_zero(map, true); ++ else ++ map = ERR_PTR(-ENOENT); ++ spin_unlock_bh(&map_idr_lock); ++ ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ ++ fd = bpf_map_new_fd(map, f_flags); ++ if (fd < 0) ++ bpf_map_put_with_uref(map); ++ ++ return fd; ++} ++ ++static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, ++ unsigned long addr, u32 *off, ++ u32 *type) ++{ ++ const struct bpf_map *map; ++ int i; ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { ++ map = prog->aux->used_maps[i]; ++ if (map == (void *)addr) { ++ *type = BPF_PSEUDO_MAP_FD; ++ goto out; ++ } ++ if (!map->ops->map_direct_value_meta) ++ continue; ++ if (!map->ops->map_direct_value_meta(map, addr, off)) { ++ *type = BPF_PSEUDO_MAP_VALUE; ++ goto out; ++ } ++ } ++ map = NULL; ++ 
++out: ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return map; ++} ++ ++static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, ++ const struct cred *f_cred) ++{ ++ const struct bpf_map *map; ++ struct bpf_insn *insns; ++ u32 off, type; ++ u64 imm; ++ u8 code; ++ int i; ++ ++ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), ++ GFP_USER); ++ if (!insns) ++ return insns; ++ ++ for (i = 0; i < prog->len; i++) { ++ code = insns[i].code; ++ ++ if (code == (BPF_JMP | BPF_TAIL_CALL)) { ++ insns[i].code = BPF_JMP | BPF_CALL; ++ insns[i].imm = BPF_FUNC_tail_call; ++ /* fall-through */ ++ } ++ if (code == (BPF_JMP | BPF_CALL) || ++ code == (BPF_JMP | BPF_CALL_ARGS)) { ++ if (code == (BPF_JMP | BPF_CALL_ARGS)) ++ insns[i].code = BPF_JMP | BPF_CALL; ++ if (!bpf_dump_raw_ok(f_cred)) ++ insns[i].imm = 0; ++ continue; ++ } ++ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { ++ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; ++ continue; ++ } ++ ++ if (code != (BPF_LD | BPF_IMM | BPF_DW)) ++ continue; ++ ++ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; ++ map = bpf_map_from_imm(prog, imm, &off, &type); ++ if (map) { ++ insns[i].src_reg = type; ++ insns[i].imm = map->id; ++ insns[i + 1].imm = off; ++ continue; ++ } ++ } ++ ++ return insns; ++} ++ ++static int set_info_rec_size(struct bpf_prog_info *info) ++{ ++ /* ++ * Ensure info.*_rec_size is the same as kernel expected size ++ * ++ * or ++ * ++ * Only allow zero *_rec_size if both _rec_size and _cnt are ++ * zero. In this case, the kernel will set the expected ++ * _rec_size back to the info. ++ */ ++ ++ if ((info->nr_func_info || info->func_info_rec_size) && ++ info->func_info_rec_size != sizeof(struct bpf_func_info)) ++ return -EINVAL; ++ ++ if ((info->nr_line_info || info->line_info_rec_size) && ++ info->line_info_rec_size != sizeof(struct bpf_line_info)) ++ return -EINVAL; ++ ++ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && ++ info->jited_line_info_rec_size != sizeof(__u64)) ++ return -EINVAL; ++ ++ info->func_info_rec_size = sizeof(struct bpf_func_info); ++ info->line_info_rec_size = sizeof(struct bpf_line_info); ++ info->jited_line_info_rec_size = sizeof(__u64); ++ ++ return 0; ++} ++ ++static int bpf_prog_get_info_by_fd(struct file *file, ++ struct bpf_prog *prog, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct btf *attach_btf = bpf_prog_get_target_btf(prog); ++ struct bpf_prog_info info; ++ u32 info_len = attr->info.info_len; ++ struct bpf_prog_kstats stats; ++ char __user *uinsns; ++ u32 ulen; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = prog->type; ++ info.id = prog->aux->id; ++ info.load_time = prog->aux->load_time; ++ info.created_by_uid = from_kuid_munged(current_user_ns(), ++ prog->aux->user->uid); ++ info.gpl_compatible = prog->gpl_compatible; ++ ++ memcpy(info.tag, prog->tag, sizeof(prog->tag)); ++ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); ++ ++ mutex_lock(&prog->aux->used_maps_mutex); ++ ulen = info.nr_map_ids; ++ info.nr_map_ids = prog->aux->used_map_cnt; ++ ulen = min_t(u32, info.nr_map_ids, ulen); ++ if (ulen) { ++ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); ++ u32 i; 
++ ++ for (i = 0; i < ulen; i++) ++ if (put_user(prog->aux->used_maps[i]->id, ++ &user_map_ids[i])) { ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ return -EFAULT; ++ } ++ } ++ mutex_unlock(&prog->aux->used_maps_mutex); ++ ++ err = set_info_rec_size(&info); ++ if (err) ++ return err; ++ ++ bpf_prog_get_stats(prog, &stats); ++ info.run_time_ns = stats.nsecs; ++ info.run_cnt = stats.cnt; ++ info.recursion_misses = stats.misses; ++ ++ info.verified_insns = prog->aux->verified_insns; ++ ++ if (!bpf_capable()) { ++ info.jited_prog_len = 0; ++ info.xlated_prog_len = 0; ++ info.nr_jited_ksyms = 0; ++ info.nr_jited_func_lens = 0; ++ info.nr_func_info = 0; ++ info.nr_line_info = 0; ++ info.nr_jited_line_info = 0; ++ goto done; ++ } ++ ++ ulen = info.xlated_prog_len; ++ info.xlated_prog_len = bpf_prog_insn_size(prog); ++ if (info.xlated_prog_len && ulen) { ++ struct bpf_insn *insns_sanitized; ++ bool fault; ++ ++ if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { ++ info.xlated_prog_insns = 0; ++ goto done; ++ } ++ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); ++ if (!insns_sanitized) ++ return -ENOMEM; ++ uinsns = u64_to_user_ptr(info.xlated_prog_insns); ++ ulen = min_t(u32, info.xlated_prog_len, ulen); ++ fault = copy_to_user(uinsns, insns_sanitized, ulen); ++ kfree(insns_sanitized); ++ if (fault) ++ return -EFAULT; ++ } ++ ++ if (bpf_prog_is_dev_bound(prog->aux)) { ++ err = bpf_prog_offload_info_fill(&info, prog); ++ if (err) ++ return err; ++ goto done; ++ } ++ ++ /* NOTE: the following code is supposed to be skipped for offload. ++ * bpf_prog_offload_info_fill() is the place to fill similar fields ++ * for offload. ++ */ ++ ulen = info.jited_prog_len; ++ if (prog->aux->func_cnt) { ++ u32 i; ++ ++ info.jited_prog_len = 0; ++ for (i = 0; i < prog->aux->func_cnt; i++) ++ info.jited_prog_len += prog->aux->func[i]->jited_len; ++ } else { ++ info.jited_prog_len = prog->jited_len; ++ } ++ ++ if (info.jited_prog_len && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ uinsns = u64_to_user_ptr(info.jited_prog_insns); ++ ulen = min_t(u32, info.jited_prog_len, ulen); ++ ++ /* for multi-function programs, copy the JITed ++ * instructions for all the functions ++ */ ++ if (prog->aux->func_cnt) { ++ u32 len, free, i; ++ u8 *img; ++ ++ free = ulen; ++ for (i = 0; i < prog->aux->func_cnt; i++) { ++ len = prog->aux->func[i]->jited_len; ++ len = min_t(u32, len, free); ++ img = (u8 *) prog->aux->func[i]->bpf_func; ++ if (copy_to_user(uinsns, img, len)) ++ return -EFAULT; ++ uinsns += len; ++ free -= len; ++ if (!free) ++ break; ++ } ++ } else { ++ if (copy_to_user(uinsns, prog->bpf_func, ulen)) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_prog_insns = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_ksyms; ++ info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long ksym_addr; ++ u64 __user *user_ksyms; ++ u32 i; ++ ++ /* copy the address of the kernel symbol ++ * corresponding to each function ++ */ ++ ulen = min_t(u32, info.nr_jited_ksyms, ulen); ++ user_ksyms = u64_to_user_ptr(info.jited_ksyms); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ ksym_addr = (unsigned long) ++ prog->aux->func[i]->bpf_func; ++ if (put_user((u64) ksym_addr, ++ &user_ksyms[i])) ++ return -EFAULT; ++ } ++ } else { ++ ksym_addr = (unsigned long) prog->bpf_func; ++ if (put_user((u64) ksym_addr, &user_ksyms[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_ksyms = 0; ++ } ++ } ++ ++ ulen = info.nr_jited_func_lens; ++ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; ++ if (ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ u32 __user *user_lens; ++ u32 func_len, i; ++ ++ /* copy the JITed image lengths for each function */ ++ ulen = min_t(u32, info.nr_jited_func_lens, ulen); ++ user_lens = u64_to_user_ptr(info.jited_func_lens); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ func_len = ++ prog->aux->func[i]->jited_len; ++ if (put_user(func_len, &user_lens[i])) ++ return -EFAULT; ++ } ++ } else { ++ func_len = prog->jited_len; ++ if (put_user(func_len, &user_lens[0])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_func_lens = 0; ++ } ++ } ++ ++ if (prog->aux->btf) ++ info.btf_id = btf_obj_id(prog->aux->btf); ++ info.attach_btf_id = prog->aux->attach_btf_id; ++ if (attach_btf) ++ info.attach_btf_obj_id = btf_obj_id(attach_btf); ++ ++ ulen = info.nr_func_info; ++ info.nr_func_info = prog->aux->func_info_cnt; ++ if (info.nr_func_info && ulen) { ++ char __user *user_finfo; ++ ++ user_finfo = u64_to_user_ptr(info.func_info); ++ ulen = min_t(u32, info.nr_func_info, ulen); ++ if (copy_to_user(user_finfo, prog->aux->func_info, ++ info.func_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_line_info; ++ info.nr_line_info = prog->aux->nr_linfo; ++ if (info.nr_line_info && ulen) { ++ __u8 __user *user_linfo; ++ ++ user_linfo = u64_to_user_ptr(info.line_info); ++ ulen = min_t(u32, info.nr_line_info, ulen); ++ if (copy_to_user(user_linfo, prog->aux->linfo, ++ info.line_info_rec_size * ulen)) ++ return -EFAULT; ++ } ++ ++ ulen = info.nr_jited_line_info; ++ if (prog->aux->jited_linfo) ++ info.nr_jited_line_info = prog->aux->nr_linfo; ++ else ++ info.nr_jited_line_info = 0; ++ if (info.nr_jited_line_info && ulen) { ++ if (bpf_dump_raw_ok(file->f_cred)) { ++ unsigned long line_addr; ++ __u64 __user *user_linfo; ++ u32 i; ++ ++ user_linfo = u64_to_user_ptr(info.jited_line_info); ++ ulen = min_t(u32, info.nr_jited_line_info, ulen); ++ for (i = 0; i < ulen; i++) { ++ line_addr = (unsigned long)prog->aux->jited_linfo[i]; ++ if (put_user((__u64)line_addr, &user_linfo[i])) ++ return -EFAULT; ++ } ++ } else { ++ info.jited_line_info = 0; ++ } ++ } ++ ++ ulen = info.nr_prog_tags; ++ info.nr_prog_tags = prog->aux->func_cnt ? 
: 1; ++ if (ulen) { ++ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; ++ u32 i; ++ ++ user_prog_tags = u64_to_user_ptr(info.prog_tags); ++ ulen = min_t(u32, info.nr_prog_tags, ulen); ++ if (prog->aux->func_cnt) { ++ for (i = 0; i < ulen; i++) { ++ if (copy_to_user(user_prog_tags[i], ++ prog->aux->func[i]->tag, ++ BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(user_prog_tags[0], ++ prog->tag, BPF_TAG_SIZE)) ++ return -EFAULT; ++ } ++ } ++ ++done: ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_map_get_info_by_fd(struct file *file, ++ struct bpf_map *map, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_map_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ info.type = map->map_type; ++ info.id = map->id; ++ info.key_size = map->key_size; ++ info.value_size = map->value_size; ++ info.max_entries = map->max_entries; ++ info.map_flags = map->map_flags; ++ info.map_extra = map->map_extra; ++ memcpy(info.name, map->name, sizeof(map->name)); ++ ++ if (map->btf) { ++ info.btf_id = btf_obj_id(map->btf); ++ info.btf_key_type_id = map->btf_key_type_id; ++ info.btf_value_type_id = map->btf_value_type_id; ++ } ++ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; ++ ++ if (bpf_map_is_dev_bound(map)) { ++ err = bpf_map_offload_info_fill(&info, map); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int bpf_btf_get_info_by_fd(struct file *file, ++ struct btf *btf, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); ++ if (err) ++ return err; ++ ++ return btf_get_info_by_fd(btf, attr, uattr); ++} ++ ++static int bpf_link_get_info_by_fd(struct file *file, ++ struct bpf_link *link, ++ const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); ++ struct bpf_link_info info; ++ u32 info_len = attr->info.info_len; ++ int err; ++ ++ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); ++ if (err) ++ return err; ++ info_len = min_t(u32, sizeof(info), info_len); ++ ++ memset(&info, 0, sizeof(info)); ++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++ info.type = link->type; ++ info.id = link->id; ++ info.prog_id = link->prog->aux->id; ++ ++ if (link->ops->fill_link_info) { ++ err = link->ops->fill_link_info(link, &info); ++ if (err) ++ return err; ++ } ++ ++ if (copy_to_user(uinfo, &info, info_len) || ++ put_user(info_len, &uattr->info.info_len)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++ ++#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info ++ ++static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ int ufd = attr->info.bpf_fd; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) ++ return -EINVAL; ++ ++ f = fdget(ufd); ++ if 
(!f.file) ++ return -EBADFD; ++ ++ if (f.file->f_op == &bpf_prog_fops) ++ err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &bpf_map_fops) ++ err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, ++ uattr); ++ else if (f.file->f_op == &btf_fops) ++ err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); ++ else if (f.file->f_op == &bpf_link_fops) ++ err = bpf_link_get_info_by_fd(f.file, f.file->private_data, ++ attr, uattr); ++ else ++ err = -EINVAL; ++ ++ fdput(f); ++ return err; ++} ++ ++#define BPF_BTF_LOAD_LAST_FIELD btf_log_level ++ ++static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) ++{ ++ if (CHECK_ATTR(BPF_BTF_LOAD)) ++ return -EINVAL; ++ ++ if (!bpf_capable()) ++ return -EPERM; ++ ++ return btf_new_fd(attr, uattr); ++} ++ ++#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id ++ ++static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) ++{ ++ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return btf_get_fd_by_id(attr->btf_id); ++} ++ ++static int bpf_task_fd_query_copy(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ u32 prog_id, u32 fd_type, ++ const char *buf, u64 probe_offset, ++ u64 probe_addr) ++{ ++ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); ++ u32 len = buf ? strlen(buf) : 0, input_len; ++ int err = 0; ++ ++ if (put_user(len, &uattr->task_fd_query.buf_len)) ++ return -EFAULT; ++ input_len = attr->task_fd_query.buf_len; ++ if (input_len && ubuf) { ++ if (!len) { ++ /* nothing to copy, just make ubuf NULL terminated */ ++ char zero = '\0'; ++ ++ if (put_user(zero, ubuf)) ++ return -EFAULT; ++ } else if (input_len >= len + 1) { ++ /* ubuf can hold the string with NULL terminator */ ++ if (copy_to_user(ubuf, buf, len + 1)) ++ return -EFAULT; ++ } else { ++ /* ubuf cannot hold the string with NULL terminator, ++ * do a partial copy with NULL terminator. 
++ */ ++ char zero = '\0'; ++ ++ err = -ENOSPC; ++ if (copy_to_user(ubuf, buf, input_len - 1)) ++ return -EFAULT; ++ if (put_user(zero, ubuf + input_len - 1)) ++ return -EFAULT; ++ } ++ } ++ ++ if (put_user(prog_id, &uattr->task_fd_query.prog_id) || ++ put_user(fd_type, &uattr->task_fd_query.fd_type) || ++ put_user(probe_offset, &uattr->task_fd_query.probe_offset) || ++ put_user(probe_addr, &uattr->task_fd_query.probe_addr)) ++ return -EFAULT; ++ ++ return err; ++} ++ ++#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr ++ ++static int bpf_task_fd_query(const union bpf_attr *attr, ++ union bpf_attr __user *uattr) ++{ ++ pid_t pid = attr->task_fd_query.pid; ++ u32 fd = attr->task_fd_query.fd; ++ const struct perf_event *event; ++ struct task_struct *task; ++ struct file *file; ++ int err; ++ ++ if (CHECK_ATTR(BPF_TASK_FD_QUERY)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (attr->task_fd_query.flags != 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ task = get_pid_task(find_vpid(pid), PIDTYPE_PID); ++ rcu_read_unlock(); ++ if (!task) ++ return -ENOENT; ++ ++ err = 0; ++ file = fget_task(task, fd); ++ put_task_struct(task); ++ if (!file) ++ return -EBADF; ++ ++ if (file->f_op == &bpf_link_fops) { ++ struct bpf_link *link = file->private_data; ++ ++ if (link->ops == &bpf_raw_tp_link_lops) { ++ struct bpf_raw_tp_link *raw_tp = ++ container_of(link, struct bpf_raw_tp_link, link); ++ struct bpf_raw_event_map *btp = raw_tp->btp; ++ ++ err = bpf_task_fd_query_copy(attr, uattr, ++ raw_tp->link.prog->aux->id, ++ BPF_FD_TYPE_RAW_TRACEPOINT, ++ btp->tp->name, 0, 0); ++ goto put_file; ++ } ++ goto out_not_supp; ++ } ++ ++ event = perf_get_event(file); ++ if (!IS_ERR(event)) { ++ u64 probe_offset, probe_addr; ++ u32 prog_id, fd_type; ++ const char *buf; ++ ++ err = bpf_get_perf_event_info(event, &prog_id, &fd_type, ++ &buf, &probe_offset, ++ &probe_addr); ++ if (!err) ++ err = bpf_task_fd_query_copy(attr, uattr, prog_id, ++ fd_type, buf, ++ probe_offset, ++ probe_addr); ++ goto put_file; ++ } ++ ++out_not_supp: ++ err = -ENOTSUPP; ++put_file: ++ fput(file); ++ return err; ++} ++ ++#define BPF_MAP_BATCH_LAST_FIELD batch.flags ++ ++#define BPF_DO_BATCH(fn) \ ++ do { \ ++ if (!fn) { \ ++ err = -ENOTSUPP; \ ++ goto err_put; \ ++ } \ ++ err = fn(map, attr, uattr); \ ++ } while (0) ++ ++static int bpf_map_do_batch(const union bpf_attr *attr, ++ union bpf_attr __user *uattr, ++ int cmd) ++{ ++ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || ++ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; ++ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; ++ struct bpf_map *map; ++ int err, ufd; ++ struct fd f; ++ ++ if (CHECK_ATTR(BPF_MAP_BATCH)) ++ return -EINVAL; ++ ++ ufd = attr->batch.map_fd; ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ if (has_write) ++ bpf_map_write_active_inc(map); ++ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { ++ err = -EPERM; ++ goto err_put; ++ } ++ ++ if (cmd == BPF_MAP_LOOKUP_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_batch); ++ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) ++ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); ++ else if (cmd == BPF_MAP_UPDATE_BATCH) ++ BPF_DO_BATCH(map->ops->map_update_batch); ++ else ++ BPF_DO_BATCH(map->ops->map_delete_batch); ++err_put: ++ if (has_write) ++ bpf_map_write_active_dec(map); ++ fdput(f); ++ return err; ++} ++ ++#define 
BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies ++static int link_create(union bpf_attr *attr, bpfptr_t uattr) ++{ ++ enum bpf_prog_type ptype; ++ struct bpf_prog *prog; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_CREATE)) ++ return -EINVAL; ++ ++ prog = bpf_prog_get(attr->link_create.prog_fd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ ret = bpf_prog_attach_check_attach_type(prog, ++ attr->link_create.attach_type); ++ if (ret) ++ goto out; ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_EXT: ++ break; ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type != BPF_PERF_EVENT && ++ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ default: ++ ptype = attach_type_to_prog_type(attr->link_create.attach_type); ++ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ break; ++ } ++ ++ switch (prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ case BPF_PROG_TYPE_SOCK_OPS: ++ case BPF_PROG_TYPE_CGROUP_DEVICE: ++ case BPF_PROG_TYPE_CGROUP_SYSCTL: ++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: ++ ret = cgroup_bpf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_EXT: ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_LSM: ++ case BPF_PROG_TYPE_TRACING: ++ if (attr->link_create.attach_type != prog->expected_attach_type) { ++ ret = -EINVAL; ++ goto out; ++ } ++ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ++ ret = bpf_raw_tp_link_attach(prog, NULL); ++ else if (prog->expected_attach_type == BPF_TRACE_ITER) ++ ret = bpf_iter_link_attach(attr, uattr, prog); ++ else if (prog->expected_attach_type == BPF_LSM_CGROUP) ++ ret = cgroup_bpf_link_attach(attr, prog); ++ else ++ ret = bpf_tracing_prog_attach(prog, ++ attr->link_create.target_fd, ++ attr->link_create.target_btf_id, ++ attr->link_create.tracing.cookie); ++ break; ++ case BPF_PROG_TYPE_FLOW_DISSECTOR: ++ case BPF_PROG_TYPE_SK_LOOKUP: ++ ret = netns_bpf_link_create(attr, prog); ++ break; ++#ifdef CONFIG_NET ++ case BPF_PROG_TYPE_XDP: ++ ret = bpf_xdp_link_attach(attr, prog); ++ break; ++#endif ++ case BPF_PROG_TYPE_PERF_EVENT: ++ case BPF_PROG_TYPE_TRACEPOINT: ++ ret = bpf_perf_link_attach(attr, prog); ++ break; ++ case BPF_PROG_TYPE_KPROBE: ++ if (attr->link_create.attach_type == BPF_PERF_EVENT) ++ ret = bpf_perf_link_attach(attr, prog); ++ else ++ ret = bpf_kprobe_multi_link_attach(attr, prog); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++out: ++ if (ret < 0) ++ bpf_prog_put(prog); ++ return ret; ++} ++ ++#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd ++ ++static int link_update(union bpf_attr *attr) ++{ ++ struct bpf_prog *old_prog = NULL, *new_prog; ++ struct bpf_link *link; ++ u32 flags; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_UPDATE)) ++ return -EINVAL; ++ ++ flags = attr->link_update.flags; ++ if (flags & ~BPF_F_REPLACE) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_update.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ new_prog = bpf_prog_get(attr->link_update.new_prog_fd); ++ if (IS_ERR(new_prog)) { ++ ret = PTR_ERR(new_prog); ++ goto out_put_link; ++ } ++ ++ if (flags & BPF_F_REPLACE) { 
++ old_prog = bpf_prog_get(attr->link_update.old_prog_fd); ++ if (IS_ERR(old_prog)) { ++ ret = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto out_put_progs; ++ } ++ } else if (attr->link_update.old_prog_fd) { ++ ret = -EINVAL; ++ goto out_put_progs; ++ } ++ ++ if (link->ops->update_prog) ++ ret = link->ops->update_prog(link, new_prog, old_prog); ++ else ++ ret = -EINVAL; ++ ++out_put_progs: ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ if (ret) ++ bpf_prog_put(new_prog); ++out_put_link: ++ bpf_link_put(link); ++ return ret; ++} ++ ++#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd ++ ++static int link_detach(union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ int ret; ++ ++ if (CHECK_ATTR(BPF_LINK_DETACH)) ++ return -EINVAL; ++ ++ link = bpf_link_get_from_fd(attr->link_detach.link_fd); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ if (link->ops->detach) ++ ret = link->ops->detach(link); ++ else ++ ret = -EOPNOTSUPP; ++ ++ bpf_link_put(link); ++ return ret; ++} ++ ++static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) ++{ ++ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); ++} ++ ++struct bpf_link *bpf_link_by_id(u32 id) ++{ ++ struct bpf_link *link; ++ ++ if (!id) ++ return ERR_PTR(-ENOENT); ++ ++ spin_lock_bh(&link_idr_lock); ++ /* before link is "settled", ID is 0, pretend it doesn't exist yet */ ++ link = idr_find(&link_idr, id); ++ if (link) { ++ if (link->id) ++ link = bpf_link_inc_not_zero(link); ++ else ++ link = ERR_PTR(-EAGAIN); ++ } else { ++ link = ERR_PTR(-ENOENT); ++ } ++ spin_unlock_bh(&link_idr_lock); ++ return link; ++} ++ ++struct bpf_link *bpf_link_get_curr_or_next(u32 *id) ++{ ++ struct bpf_link *link; ++ ++ spin_lock_bh(&link_idr_lock); ++again: ++ link = idr_get_next(&link_idr, id); ++ if (link) { ++ link = bpf_link_inc_not_zero(link); ++ if (IS_ERR(link)) { ++ (*id)++; ++ goto again; ++ } ++ } ++ spin_unlock_bh(&link_idr_lock); ++ ++ return link; ++} ++ ++#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id ++ ++static int bpf_link_get_fd_by_id(const union bpf_attr *attr) ++{ ++ struct bpf_link *link; ++ u32 id = attr->link_id; ++ int fd; ++ ++ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ link = bpf_link_by_id(id); ++ if (IS_ERR(link)) ++ return PTR_ERR(link); ++ ++ fd = bpf_link_new_fd(link); ++ if (fd < 0) ++ bpf_link_put(link); ++ ++ return fd; ++} ++ ++DEFINE_MUTEX(bpf_stats_enabled_mutex); ++ ++static int bpf_stats_release(struct inode *inode, struct file *file) ++{ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ static_key_slow_dec(&bpf_stats_enabled_key.key); ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return 0; ++} ++ ++static const struct file_operations bpf_stats_fops = { ++ .release = bpf_stats_release, ++}; ++ ++static int bpf_enable_runtime_stats(void) ++{ ++ int fd; ++ ++ mutex_lock(&bpf_stats_enabled_mutex); ++ ++ /* Set a very high limit to avoid overflow */ ++ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return -EBUSY; ++ } ++ ++ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); ++ if (fd >= 0) ++ static_key_slow_inc(&bpf_stats_enabled_key.key); ++ ++ mutex_unlock(&bpf_stats_enabled_mutex); ++ return fd; ++} ++ ++#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type ++ ++static int bpf_enable_stats(union bpf_attr *attr) ++{ ++ ++ if (CHECK_ATTR(BPF_ENABLE_STATS)) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ 
++	switch (attr->enable_stats.type) {
++	case BPF_STATS_RUN_TIME:
++		return bpf_enable_runtime_stats();
++	default:
++		break;
++	}
++	return -EINVAL;
++}
++
++#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
++
++static int bpf_iter_create(union bpf_attr *attr)
++{
++	struct bpf_link *link;
++	int err;
++
++	if (CHECK_ATTR(BPF_ITER_CREATE))
++		return -EINVAL;
++
++	if (attr->iter_create.flags)
++		return -EINVAL;
++
++	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
++	if (IS_ERR(link))
++		return PTR_ERR(link);
++
++	err = bpf_iter_new_fd(link);
++	bpf_link_put(link);
++
++	return err;
++}
++
++#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
++
++static int bpf_prog_bind_map(union bpf_attr *attr)
++{
++	struct bpf_prog *prog;
++	struct bpf_map *map;
++	struct bpf_map **used_maps_old, **used_maps_new;
++	int i, ret = 0;
++
++	if (CHECK_ATTR(BPF_PROG_BIND_MAP))
++		return -EINVAL;
++
++	if (attr->prog_bind_map.flags)
++		return -EINVAL;
++
++	prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
++	if (IS_ERR(prog))
++		return PTR_ERR(prog);
++
++	map = bpf_map_get(attr->prog_bind_map.map_fd);
++	if (IS_ERR(map)) {
++		ret = PTR_ERR(map);
++		goto out_prog_put;
++	}
++
++	mutex_lock(&prog->aux->used_maps_mutex);
++
++	used_maps_old = prog->aux->used_maps;
++
++	for (i = 0; i < prog->aux->used_map_cnt; i++)
++		if (used_maps_old[i] == map) {
++			bpf_map_put(map);
++			goto out_unlock;
++		}
++
++	used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
++				      sizeof(used_maps_new[0]),
++				      GFP_KERNEL);
++	if (!used_maps_new) {
++		ret = -ENOMEM;
++		goto out_unlock;
++	}
++
++	memcpy(used_maps_new, used_maps_old,
++	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
++	used_maps_new[prog->aux->used_map_cnt] = map;
++
++	prog->aux->used_map_cnt++;
++	prog->aux->used_maps = used_maps_new;
++
++	kfree(used_maps_old);
++
++out_unlock:
++	mutex_unlock(&prog->aux->used_maps_mutex);
++
++	if (ret)
++		bpf_map_put(map);
++out_prog_put:
++	bpf_prog_put(prog);
++	return ret;
++}
++
++static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
++{
++	union bpf_attr attr;
++	bool capable;
++	int err;
++
++	capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
++
++	/* Intent here is for unprivileged_bpf_disabled to block key object
++	 * creation commands for unprivileged users; other actions depend
++	 * of fd availability and access to bpffs, so are dependent on
++	 * object creation success. Capabilities are later verified for
++	 * operations such as load and map create, so even with unprivileged
++	 * BPF disabled, capability checks are still carried out for these
++	 * and other operations.
++ */ ++ if (!capable && ++ (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) ++ return -EPERM; ++ ++ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); ++ if (err) ++ return err; ++ size = min_t(u32, size, sizeof(attr)); ++ ++ /* copy attributes from user space, may be less than sizeof(bpf_attr) */ ++ memset(&attr, 0, sizeof(attr)); ++ if (copy_from_bpfptr(&attr, uattr, size) != 0) ++ return -EFAULT; ++ ++ err = security_bpf(cmd, &attr, size); ++ if (err < 0) ++ return err; ++ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ err = map_create(&attr); ++ break; ++ case BPF_MAP_LOOKUP_ELEM: ++ err = map_lookup_elem(&attr); ++ break; ++ case BPF_MAP_UPDATE_ELEM: ++ err = map_update_elem(&attr, uattr); ++ break; ++ case BPF_MAP_DELETE_ELEM: ++ err = map_delete_elem(&attr); ++ break; ++ case BPF_MAP_GET_NEXT_KEY: ++ err = map_get_next_key(&attr); ++ break; ++ case BPF_MAP_FREEZE: ++ err = map_freeze(&attr); ++ break; ++ case BPF_PROG_LOAD: ++ err = bpf_prog_load(&attr, uattr); ++ break; ++ case BPF_OBJ_PIN: ++ err = bpf_obj_pin(&attr); ++ break; ++ case BPF_OBJ_GET: ++ err = bpf_obj_get(&attr); ++ break; ++ case BPF_PROG_ATTACH: ++ err = bpf_prog_attach(&attr); ++ break; ++ case BPF_PROG_DETACH: ++ err = bpf_prog_detach(&attr); ++ break; ++ case BPF_PROG_QUERY: ++ err = bpf_prog_query(&attr, uattr.user); ++ break; ++ case BPF_PROG_TEST_RUN: ++ err = bpf_prog_test_run(&attr, uattr.user); ++ break; ++ case BPF_PROG_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &prog_idr, &prog_idr_lock); ++ break; ++ case BPF_MAP_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &map_idr, &map_idr_lock); ++ break; ++ case BPF_BTF_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &btf_idr, &btf_idr_lock); ++ break; ++ case BPF_PROG_GET_FD_BY_ID: ++ err = bpf_prog_get_fd_by_id(&attr); ++ break; ++ case BPF_MAP_GET_FD_BY_ID: ++ err = bpf_map_get_fd_by_id(&attr); ++ break; ++ case BPF_OBJ_GET_INFO_BY_FD: ++ err = bpf_obj_get_info_by_fd(&attr, uattr.user); ++ break; ++ case BPF_RAW_TRACEPOINT_OPEN: ++ err = bpf_raw_tracepoint_open(&attr); ++ break; ++ case BPF_BTF_LOAD: ++ err = bpf_btf_load(&attr, uattr); ++ break; ++ case BPF_BTF_GET_FD_BY_ID: ++ err = bpf_btf_get_fd_by_id(&attr); ++ break; ++ case BPF_TASK_FD_QUERY: ++ err = bpf_task_fd_query(&attr, uattr.user); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_ELEM: ++ err = map_lookup_and_delete_elem(&attr); ++ break; ++ case BPF_MAP_LOOKUP_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); ++ break; ++ case BPF_MAP_LOOKUP_AND_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, ++ BPF_MAP_LOOKUP_AND_DELETE_BATCH); ++ break; ++ case BPF_MAP_UPDATE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); ++ break; ++ case BPF_MAP_DELETE_BATCH: ++ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); ++ break; ++ case BPF_LINK_CREATE: ++ err = link_create(&attr, uattr); ++ break; ++ case BPF_LINK_UPDATE: ++ err = link_update(&attr); ++ break; ++ case BPF_LINK_GET_FD_BY_ID: ++ err = bpf_link_get_fd_by_id(&attr); ++ break; ++ case BPF_LINK_GET_NEXT_ID: ++ err = bpf_obj_get_next_id(&attr, uattr.user, ++ &link_idr, &link_idr_lock); ++ break; ++ case BPF_ENABLE_STATS: ++ err = bpf_enable_stats(&attr); ++ break; ++ case BPF_ITER_CREATE: ++ err = bpf_iter_create(&attr); ++ break; ++ case BPF_LINK_DETACH: ++ err = link_detach(&attr); ++ break; ++ case BPF_PROG_BIND_MAP: ++ err = bpf_prog_bind_map(&attr); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++ return 
err; ++} ++ ++SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) ++{ ++ return __sys_bpf(cmd, USER_BPFPTR(uattr), size); ++} ++ ++static bool syscall_prog_is_valid_access(int off, int size, ++ enum bpf_access_type type, ++ const struct bpf_prog *prog, ++ struct bpf_insn_access_aux *info) ++{ ++ if (off < 0 || off >= U16_MAX) ++ return false; ++ if (off % size != 0) ++ return false; ++ return true; ++} ++ ++BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) ++{ ++ switch (cmd) { ++ case BPF_MAP_CREATE: ++ case BPF_MAP_UPDATE_ELEM: ++ case BPF_MAP_FREEZE: ++ case BPF_PROG_LOAD: ++ case BPF_BTF_LOAD: ++ case BPF_LINK_CREATE: ++ case BPF_RAW_TRACEPOINT_OPEN: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); ++} ++ ++ ++/* To shut up -Wmissing-prototypes. ++ * This function is used by the kernel light skeleton ++ * to load bpf programs when modules are loaded or during kernel boot. ++ * See tools/lib/bpf/skel_internal.h ++ */ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); ++ ++int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) ++{ ++ struct bpf_prog * __maybe_unused prog; ++ struct bpf_tramp_run_ctx __maybe_unused run_ctx; ++ ++ switch (cmd) { ++#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ ++ case BPF_PROG_TEST_RUN: ++ if (attr->test.data_in || attr->test.data_out || ++ attr->test.ctx_out || attr->test.duration || ++ attr->test.repeat || attr->test.flags) ++ return -EINVAL; ++ ++ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || ++ attr->test.ctx_size_in > U16_MAX) { ++ bpf_prog_put(prog); ++ return -EINVAL; ++ } ++ ++ run_ctx.bpf_cookie = 0; ++ run_ctx.saved_run_ctx = NULL; ++ if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { ++ /* recursion detected */ ++ bpf_prog_put(prog); ++ return -EBUSY; ++ } ++ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); ++ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); ++ bpf_prog_put(prog); ++ return 0; ++#endif ++ default: ++ return ____bpf_sys_bpf(cmd, attr, size); ++ } ++} ++EXPORT_SYMBOL(kern_sys_bpf); ++ ++static const struct bpf_func_proto bpf_sys_bpf_proto = { ++ .func = bpf_sys_bpf, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_ANYTHING, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, ++ .arg3_type = ARG_CONST_SIZE, ++}; ++ ++const struct bpf_func_proto * __weak ++tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++{ ++ return bpf_base_func_proto(func_id); ++} ++ ++BPF_CALL_1(bpf_sys_close, u32, fd) ++{ ++ /* When bpf program calls this helper there should not be ++ * an fdget() without matching completed fdput(). 
++	 * This helper is allowed in the following callchain only:
++	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
++	 */
++	return close_fd(fd);
++}
++
++static const struct bpf_func_proto bpf_sys_close_proto = {
++	.func		= bpf_sys_close,
++	.gpl_only	= false,
++	.ret_type	= RET_INTEGER,
++	.arg1_type	= ARG_ANYTHING,
++};
++
++BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
++{
++	if (flags)
++		return -EINVAL;
++
++	if (name_sz <= 1 || name[name_sz - 1])
++		return -EINVAL;
++
++	if (!bpf_dump_raw_ok(current_cred()))
++		return -EPERM;
++
++	*res = kallsyms_lookup_name(name);
++	return *res ? 0 : -ENOENT;
++}
++
++static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
++	.func		= bpf_kallsyms_lookup_name,
++	.gpl_only	= false,
++	.ret_type	= RET_INTEGER,
++	.arg1_type	= ARG_PTR_TO_MEM,
++	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
++	.arg3_type	= ARG_ANYTHING,
++	.arg4_type	= ARG_PTR_TO_LONG,
++};
++
++static const struct bpf_func_proto *
++syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
++{
++	switch (func_id) {
++	case BPF_FUNC_sys_bpf:
++		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
++	case BPF_FUNC_btf_find_by_name_kind:
++		return &bpf_btf_find_by_name_kind_proto;
++	case BPF_FUNC_sys_close:
++		return &bpf_sys_close_proto;
++	case BPF_FUNC_kallsyms_lookup_name:
++		return &bpf_kallsyms_lookup_name_proto;
++	default:
++		return tracing_prog_func_proto(func_id, prog);
++	}
++}
++
++const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
++	.get_func_proto  = syscall_prog_func_proto,
++	.is_valid_access = syscall_prog_is_valid_access,
++};
++
++const struct bpf_prog_ops bpf_syscall_prog_ops = {
++	.test_run = bpf_prog_test_run_syscall,
++};
++
++#ifdef CONFIG_SYSCTL
++static int bpf_stats_handler(struct ctl_table *table, int write,
++			     void *buffer, size_t *lenp, loff_t *ppos)
++{
++	struct static_key *key = (struct static_key *)table->data;
++	static int saved_val;
++	int val, ret;
++	struct ctl_table tmp = {
++		.data   = &val,
++		.maxlen = sizeof(val),
++		.mode   = table->mode,
++		.extra1 = SYSCTL_ZERO,
++		.extra2 = SYSCTL_ONE,
++	};
++
++	if (write && !capable(CAP_SYS_ADMIN))
++		return -EPERM;
++
++	mutex_lock(&bpf_stats_enabled_mutex);
++	val = saved_val;
++	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
++	if (write && !ret && val != saved_val) {
++		if (val)
++			static_key_slow_inc(key);
++		else
++			static_key_slow_dec(key);
++		saved_val = val;
++	}
++	mutex_unlock(&bpf_stats_enabled_mutex);
++	return ret;
++}
++
++void __weak unpriv_ebpf_notify(int new_state)
++{
++}
++
++static int bpf_unpriv_handler(struct ctl_table *table, int write,
++			      void *buffer, size_t *lenp, loff_t *ppos)
++{
++	int ret, unpriv_enable = *(int *)table->data;
++	bool locked_state = unpriv_enable == 1;
++	struct ctl_table tmp = *table;
++
++	if (write && !capable(CAP_SYS_ADMIN))
++		return -EPERM;
++
++	tmp.data = &unpriv_enable;
++	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
++	if (write && !ret) {
++		if (locked_state && unpriv_enable != 1)
++			return -EPERM;
++		*(int *)table->data = unpriv_enable;
++	}
++
++	unpriv_ebpf_notify(unpriv_enable);
++
++	return ret;
++}
++
++static struct ctl_table bpf_syscall_table[] = {
++	{
++		.procname	= "unprivileged_bpf_disabled",
++		.data		= &sysctl_unprivileged_bpf_disabled,
++		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
++		.mode		= 0644,
++		.proc_handler	= bpf_unpriv_handler,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_TWO,
++	},
++	{
++		.procname	=
"bpf_stats_enabled", ++ .data = &bpf_stats_enabled_key.key, ++ .maxlen = sizeof(bpf_stats_enabled_key), ++ .mode = 0644, ++ .proc_handler = bpf_stats_handler, ++ }, ++ { } ++}; ++ ++static int __init bpf_syscall_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", bpf_syscall_table); ++ return 0; ++} ++late_initcall(bpf_syscall_sysctl_init); ++#endif /* CONFIG_SYSCTL */ +diff -rupN linux.orig/kernel/entry/common.c linux/kernel/entry/common.c +--- linux.orig/kernel/entry/common.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/entry/common.c 2022-12-04 10:40:26.716034044 -0500 +@@ -153,7 +153,7 @@ static unsigned long exit_to_user_mode_l local_irq_enable_exit_to_user(ti_work); @@ -5094,7 +33236,7 @@ index 063068a9ea9b3..26b772720b227 100644 schedule(); if (ti_work & _TIF_UPROBE) -@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void) +@@ -381,7 +381,7 @@ void raw_irqentry_exit_cond_resched(void rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); @@ -5103,11 +33245,10 @@ index 063068a9ea9b3..26b772720b227 100644 preempt_schedule_irq(); } } -diff --git a/kernel/hung_task.c b/kernel/hung_task.c -index bb2354f73dedc..19c9de825d248 100644 ---- a/kernel/hung_task.c -+++ b/kernel/hung_task.c -@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +diff -rupN linux.orig/kernel/hung_task.c linux/kernel/hung_task.c +--- linux.orig/kernel/hung_task.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/hung_task.c 2022-12-04 10:40:26.716034044 -0500 +@@ -127,6 +127,8 @@ static void check_hung_task(struct task_ * complain: */ if (sysctl_hung_task_warnings) { @@ -5116,7 +33257,7 @@ index bb2354f73dedc..19c9de825d248 100644 if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", -@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) +@@ -142,6 +144,8 @@ static void check_hung_task(struct task_ if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; @@ -5125,7 +33266,7 @@ index bb2354f73dedc..19c9de825d248 100644 } touch_nmi_watchdog(); -@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) +@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_t } unlock: rcu_read_unlock(); @@ -5144,11 +33285,10 @@ index bb2354f73dedc..19c9de825d248 100644 } if (hung_task_call_panic) -diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c -index 5db0230aa6b52..476a3fecb8c53 100644 ---- a/kernel/irq/irqdesc.c -+++ b/kernel/irq/irqdesc.c -@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) +diff -rupN linux.orig/kernel/irq/irqdesc.c linux/kernel/irq/irqdesc.c +--- linux.orig/kernel/irq/irqdesc.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/irq/irqdesc.c 2022-12-04 10:40:26.716034044 -0500 +@@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); @@ -5179,10 +33319,24 @@ index 5db0230aa6b52..476a3fecb8c53 100644 /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. 
-diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c -index b1292a57c2a53..a6514db7ef58e 100644 ---- a/kernel/ksysfs.c -+++ b/kernel/ksysfs.c +diff -rupN linux.orig/kernel/Kconfig.preempt linux/kernel/Kconfig.preempt +--- linux.orig/kernel/Kconfig.preempt 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/Kconfig.preempt 2022-12-04 10:40:26.716034044 -0500 +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + config PREEMPT_NONE_BUILD + bool + +diff -rupN linux.orig/kernel/ksysfs.c linux/kernel/ksysfs.c +--- linux.orig/kernel/ksysfs.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/ksysfs.c 2022-12-04 10:40:26.716034044 -0500 @@ -137,6 +137,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ @@ -5199,20 +33353,19 @@ index b1292a57c2a53..a6514db7ef58e 100644 /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -@@ -227,6 +236,9 @@ static struct attribute * kernel_attrs[] = { - #ifndef CONFIG_TINY_RCU +@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] &rcu_expedited_attr.attr, &rcu_normal_attr.attr, -+#endif + #endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, - #endif ++#endif NULL }; -diff --git a/kernel/panic.c b/kernel/panic.c -index c6eb8f8db0c05..c4e8896e3caba 100644 ---- a/kernel/panic.c -+++ b/kernel/panic.c + +diff -rupN linux.orig/kernel/panic.c linux/kernel/panic.c +--- linux.orig/kernel/panic.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/panic.c 2022-12-04 10:40:26.716034044 -0500 @@ -257,7 +257,6 @@ void panic(const char *fmt, ...) panic_smp_self_stop(); @@ -5249,7 +33402,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 crash_smp_send_stop(); } -@@ -604,6 +610,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -604,6 +610,8 @@ void __warn(const char *file, int line, { disable_trace_on_warning(); @@ -5258,7 +33411,7 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, -@@ -633,6 +641,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, +@@ -633,6 +641,8 @@ void __warn(const char *file, int line, /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); @@ -5267,10 +33420,9 @@ index c6eb8f8db0c05..c4e8896e3caba 100644 } #ifndef __WARN_FLAGS -diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h -index d947ca6c84f99..e7d8578860adf 100644 ---- a/kernel/printk/internal.h -+++ b/kernel/printk/internal.h +diff -rupN linux.orig/kernel/printk/internal.h linux/kernel/printk/internal.h +--- linux.orig/kernel/printk/internal.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/internal.h 2022-12-04 10:40:26.716034044 -0500 @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -5280,10 +33432,9 @@ index d947ca6c84f99..e7d8578860adf 100644 __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, -diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c -index a1a81fd9889bb..f1f9ce9b23f60 100644 ---- a/kernel/printk/printk.c -+++ b/kernel/printk/printk.c +diff -rupN linux.orig/kernel/printk/printk.c linux/kernel/printk/printk.c +--- linux.orig/kernel/printk/printk.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk.c 2022-12-04 10:40:26.720034034 -0500 @@ -44,6 +44,7 @@ #include #include @@ -5292,11 +33443,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #include #include #include -@@ -223,6 +224,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - /* Number of registered extended console drivers. */ +@@ -224,6 +225,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl static int nr_ext_console_drivers; -+/* + /* + * Used to synchronize printing kthreads against direct printing via + * console_trylock/console_unlock. + * @@ -5326,9 +33476,10 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + - /* ++/* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. + */ @@ -271,14 +302,49 @@ static bool panic_in_progress(void) } @@ -5342,15 +33493,15 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + * Tracks whether kthread printers are all blocked. A value of true implies + * that the console is locked via console_lock() or the console is suspended. + * Writing to this variable requires holding @console_sem. - */ --static int console_locked, console_suspended; ++ */ +static bool console_kthreads_blocked; + +/* + * Block all kthread printers from a schedulable context. + * + * Requires holding @console_sem. -+ */ + */ +-static int console_locked, console_suspended; +static void console_kthreads_block(void) +{ + struct console *con; @@ -5386,7 +33537,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Array of consoles built from command line options (console=) -@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; +@@ -361,7 +427,75 @@ static int console_msg_format = MSG_FORM /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); @@ -5462,7 +33613,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. 
*/ /* the next printk record to read by syslog(READ) or /proc/kmsg */ -@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable_and_check(void) +@@ -1850,6 +1984,7 @@ static int console_lock_spinning_disable return 1; } @@ -5470,7 +33621,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /** * console_trylock_spinning - try to get console_lock by busy waiting * -@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void) +@@ -1923,6 +2058,7 @@ static int console_trylock_spinning(void return 1; } @@ -5478,7 +33629,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * Call the specified console driver, asking it to write out the specified -@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void) +@@ -1930,19 +2066,28 @@ static int console_trylock_spinning(void * dropped, a dropped message will be written out first. */ static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5513,7 +33664,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } /* -@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2252,10 +2397,22 @@ asmlinkage int vprintk_emit(int facility printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ @@ -5538,7 +33689,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. -@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility, int level, +@@ -2270,6 +2427,7 @@ asmlinkage int vprintk_emit(int facility if (console_trylock_spinning()) console_unlock(); preempt_enable(); @@ -5546,7 +33697,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } wake_up_klogd(); -@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+@@ -2296,8 +2454,80 @@ asmlinkage __visible int _printk(const c } EXPORT_SYMBOL(_printk); @@ -5627,7 +33778,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 -@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -2308,6 +2538,8 @@ static bool __pr_flush(struct console *c #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 @@ -5636,7 +33787,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static u64 syslog_seq; static size_t record_print_text(const struct printk_record *r, -@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, +@@ -2326,11 +2558,13 @@ static ssize_t msg_print_ext_body(char * static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, @@ -5651,7 +33802,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #endif /* CONFIG_PRINTK */ -@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned int cpu) +@@ -2549,6 +2783,14 @@ static int console_cpu_notify(unsigned i /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @@ -5708,7 +33859,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } EXPORT_SYMBOL(is_console_locked); -@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_panic(void) +@@ -2620,18 +2877,9 @@ static bool abandon_console_lock_in_pani return atomic_read(&panic_cpu) != raw_smp_processor_id(); } @@ -5729,7 +33880,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return false; /* -@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(struct console *con) +@@ -2640,18 +2888,116 @@ static inline bool console_is_usable(str * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && @@ -5907,7 +34058,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } -@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2701,7 +3058,7 @@ static bool console_emit_next_record(str /* Skip record that has level above the console loglevel. */ if (suppress_message_printing(r.info->level)) { @@ -5916,7 +34067,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 goto skip; } -@@ -2715,31 +3072,65 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ +@@ -2715,32 +3072,66 @@ static bool console_emit_next_record(str len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } @@ -5969,7 +34120,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 return true; } -+/* + /* + * Print a record for a given console, but allow another printk() caller to + * take over the console_lock and continue printing. + * @@ -5997,10 +34148,11 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 + return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover); +} + - /* ++/* * Print out all remaining records to all consoles. * -@@ -2758,8 +3149,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ + * @do_cond_resched is set by the caller. It can be true only in schedulable +@@ -2758,8 +3149,8 @@ skip: * were flushed to all usable consoles. 
A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this @@ -6011,7 +34163,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 * * Requires the console_lock. */ -@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2776,24 +3167,26 @@ static bool console_flush_all(bool do_co *handover = false; do { @@ -6045,7 +34197,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } if (*handover) return false; -@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove +@@ -2818,6 +3211,68 @@ static bool console_flush_all(bool do_co return any_usable; } @@ -6141,7 +34293,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 /* * If someone else is holding the console lock, trylock will fail * and may_schedule may be set. Ignore and proceed to unlock so -@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flush_mode mode) +@@ -2946,7 +3409,7 @@ void console_flush_on_panic(enum con_flu seq = prb_first_valid_seq(prb); for_each_console(c) @@ -6150,7 +34302,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } console_unlock(); } -@@ -3189,16 +3652,27 @@ void register_console(struct console *newcon) +@@ -3189,16 +3652,27 @@ void register_console(struct console *ne if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; @@ -6189,7 +34341,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 struct console *con; int res; -@@ -3265,9 +3740,26 @@ int unregister_console(struct console *console) +@@ -3265,9 +3740,26 @@ int unregister_console(struct console *c console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; @@ -6237,7 +34389,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. 
*/ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) -@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre +@@ -3384,7 +3890,7 @@ static bool __pr_flush(struct console *c for_each_console(c) { if (con && con != c) continue; @@ -6246,7 +34398,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 continue; printk_seq = c->seq; if (printk_seq < seq) -@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) +@@ -3444,11 +3950,215 @@ bool pr_flush(int timeout_ms, bool reset } EXPORT_SYMBOL(pr_flush); @@ -6464,7 +34616,7 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 static DEFINE_PER_CPU(int, printk_pending); -@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) +@@ -3456,10 +4166,14 @@ static void wake_up_klogd_work_func(stru { int pending = this_cpu_xchg(printk_pending, 0); @@ -6513,10 +34665,9 @@ index a1a81fd9889bb..f1f9ce9b23f60 100644 } void printk_trigger_flush(void) -diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c -index ef0f9a2044da1..caac4de1ea59a 100644 ---- a/kernel/printk/printk_safe.c -+++ b/kernel/printk/printk_safe.c +diff -rupN linux.orig/kernel/printk/printk_safe.c linux/kernel/printk/printk_safe.c +--- linux.orig/kernel/printk/printk_safe.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/printk/printk_safe.c 2022-12-04 10:40:26.720034034 -0500 @@ -8,7 +8,9 @@ #include #include @@ -6527,7 +34678,7 @@ index ef0f9a2044da1..caac4de1ea59a 100644 #include "internal.h" -@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -6561,11 +34712,10 @@ index ef0f9a2044da1..caac4de1ea59a 100644 + timeout_ms -= 1; + } +} -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index d8e1b270a065f..257cb6f5ea622 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsigned int cpu) +diff -rupN linux.orig/kernel/rcu/rcutorture.c linux/kernel/rcu/rcutorture.c +--- linux.orig/kernel/rcu/rcutorture.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/rcutorture.c 2022-12-04 10:40:26.720034034 -0500 +@@ -2157,6 +2157,12 @@ static int rcutorture_booster_init(unsig WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -6578,11 +34728,10 @@ index d8e1b270a065f..257cb6f5ea622 100644 } /* Don't allow time recalculation while creating a new task. */ -diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h -index c3fbbcc09327f..195cad14742dd 100644 ---- a/kernel/rcu/tree_stall.h -+++ b/kernel/rcu/tree_stall.h -@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned long gps) +diff -rupN linux.orig/kernel/rcu/tree_stall.h linux/kernel/rcu/tree_stall.h +--- linux.orig/kernel/rcu/tree_stall.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/rcu/tree_stall.h 2022-12-04 10:40:26.720034034 -0500 +@@ -643,6 +643,7 @@ static void print_cpu_stall(unsigned lon * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. 
*/ @@ -6590,7 +34739,7 @@ index c3fbbcc09327f..195cad14742dd 100644 trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned long gps) +@@ -677,6 +678,7 @@ static void print_cpu_stall(unsigned lon */ set_tsk_need_resched(current); set_preempt_need_resched(); @@ -6598,10 +34747,9 @@ index c3fbbcc09327f..195cad14742dd 100644 } static void check_cpu_stall(struct rcu_data *rdp) -diff --git a/kernel/reboot.c b/kernel/reboot.c -index 3c35445bf5ad3..80564ffafabff 100644 ---- a/kernel/reboot.c -+++ b/kernel/reboot.c +diff -rupN linux.orig/kernel/reboot.c linux/kernel/reboot.c +--- linux.orig/kernel/reboot.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/reboot.c 2022-12-04 10:40:26.720034034 -0500 @@ -82,6 +82,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); @@ -6610,7 +34758,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 usermodehelper_disable(); device_shutdown(); } -@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum system_states state) +@@ -270,6 +271,7 @@ static void kernel_shutdown_prepare(enum blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; @@ -6630,7 +34778,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } return ret; -@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force) +@@ -834,6 +838,7 @@ static int __orderly_poweroff(bool force ret = run_cmd(poweroff_cmd); if (ret && force) { @@ -6638,7 +34786,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* -@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force) +@@ -843,6 +848,7 @@ static int __orderly_poweroff(bool force */ emergency_sync(); kernel_power_off(); @@ -6655,7 +34803,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off -@@ -916,6 +924,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) +@@ -916,6 +924,8 @@ static void hw_failure_emergency_powerof */ pr_emerg("Hardware protection shutdown failed. 
Trying emergency restart\n"); emergency_restart(); @@ -6664,7 +34812,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, -@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -954,11 +964,13 @@ void hw_protection_shutdown(const char * { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -6679,7 +34827,7 @@ index 3c35445bf5ad3..80564ffafabff 100644 /* * Queue a backup emergency shutdown in the event of -@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) +@@ -966,6 +978,8 @@ void hw_protection_shutdown(const char * */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); @@ -6688,10 +34836,9 @@ index 3c35445bf5ad3..80564ffafabff 100644 } EXPORT_SYMBOL_GPL(hw_protection_shutdown); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index ee28253c9ac0c..2ce515d3e6f8d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c +diff -rupN linux.orig/kernel/sched/core.c linux/kernel/sched/core.c +--- linux.orig/kernel/sched/core.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/core.c 2022-12-04 10:40:26.720034034 -0500 @@ -1046,6 +1046,46 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } @@ -6755,7 +34902,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); -@@ -3251,6 +3293,70 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3251,6 +3293,70 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ @@ -6826,7 +34973,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * wait_task_inactive - wait for a thread to unschedule. * -@@ -3269,7 +3375,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, +@@ -3269,7 +3375,7 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { @@ -6835,7 +34982,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 struct rq_flags rf; unsigned long ncsw; struct rq *rq; -@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3295,7 +3401,7 @@ unsigned long wait_task_inactive(struct * is actually now running somewhere else! */ while (task_running(rq, p)) { @@ -6844,7 +34991,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 return 0; cpu_relax(); } -@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3308,10 +3414,12 @@ unsigned long wait_task_inactive(struct rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); @@ -6859,7 +35006,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 task_rq_unlock(rq, p, &rf); /* -@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state +@@ -3340,7 +3448,7 @@ unsigned long wait_task_inactive(struct * running right now), it's preempted, and we should * yield - it could be a while. 
*/ @@ -6868,7 +35015,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); -@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4589,6 +4697,9 @@ int sched_fork(unsigned long clone_flags p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -6878,7 +35025,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) +@@ -6457,6 +6568,7 @@ static void __sched notrace __schedule(u next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); @@ -6886,7 +35033,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_schedule_common(void) +@@ -6671,6 +6783,30 @@ static void __sched notrace preempt_sche } while (need_resched()); } @@ -6917,7 +35064,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption -@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) +@@ -6684,6 +6820,8 @@ asmlinkage __visible void __sched notrac */ if (likely(!preemptible())) return; @@ -6926,7 +35073,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); -@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) +@@ -6731,6 +6869,9 @@ asmlinkage __visible void __sched notrac if (likely(!preemptible())) return; @@ -6936,7 +35083,7 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 do { /* * Because the function tracer can trace preempt_count_sub() -@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -8988,7 +9129,9 @@ void __init init_idle(struct task_struct /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -6947,11 +35094,10 @@ index ee28253c9ac0c..2ce515d3e6f8d 100644 /* * The idle tasks have their own, simple scheduling class: */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 914096c5b1ae1..3cb55e6ede337 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +diff -rupN linux.orig/kernel/sched/fair.c linux/kernel/sched/fair.c +--- linux.orig/kernel/sched/fair.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/fair.c 2022-12-04 10:40:26.720034034 -0500 +@@ -4576,7 +4576,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { @@ -6960,7 +35106,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. -@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -4600,7 +4600,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq return; if (delta > ideal_runtime) @@ -6969,7 +35115,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static void -@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -4746,7 +4746,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc * validating it and just reschedule. 
*/ if (queued) { @@ -6978,7 +35124,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } /* -@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +@@ -4895,7 +4895,7 @@ static void __account_cfs_rq_runtime(str * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) @@ -6987,7 +35133,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } static __always_inline -@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +@@ -5646,7 +5646,7 @@ static void hrtick_start_fair(struct rq if (delta < 0) { if (task_current(rq, p)) @@ -6996,7 +35142,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 return; } hrtick_start(rq, delta); -@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7307,7 +7307,7 @@ static void check_preempt_wakeup(struct return; preempt: @@ -7005,7 +35151,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved -@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_struct *p) +@@ -11454,7 +11454,7 @@ static void task_fork_fair(struct task_s * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); @@ -7014,7 +35160,7 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } se->vruntime -= cfs_rq->min_vruntime; -@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -11481,7 +11481,7 @@ prio_changed_fair(struct rq *rq, struct */ if (task_current(rq, p)) { if (p->prio > oldprio) @@ -7023,10 +35169,9 @@ index 914096c5b1ae1..3cb55e6ede337 100644 } else check_preempt_curr(rq, p, 0); } -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..e13090e33f3c4 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h +diff -rupN linux.orig/kernel/sched/features.h linux/kernel/sched/features.h +--- linux.orig/kernel/sched/features.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/features.h 2022-12-04 10:40:26.720034034 -0500 @@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) #ifdef CONFIG_PREEMPT_RT @@ -7037,11 +35182,10 @@ index ee7f23c76bd33..e13090e33f3c4 100644 #else /* -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e26688d387aeb..5b889de29e3c9 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_struct *p, int prio); +diff -rupN linux.orig/kernel/sched/sched.h linux/kernel/sched/sched.h +--- linux.orig/kernel/sched/sched.h 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/sched/sched.h 2022-12-04 10:40:26.724034024 -0500 +@@ -2356,6 +2356,15 @@ extern void reweight_task(struct task_st extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -7057,11 +35201,10 @@ index e26688d387aeb..5b889de29e3c9 100644 extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -diff --git a/kernel/signal.c b/kernel/signal.c -index 6f86fda5e432a..139b965e4fafc 100644 ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, +diff -rupN linux.orig/kernel/signal.c linux/kernel/signal.c +--- linux.orig/kernel/signal.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/signal.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2297,13 +2297,13 @@ static int ptrace_stop(int exit_code, in /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -7079,10 +35222,9 @@ index 6f86fda5e432a..139b965e4fafc 100644 freezable_schedule(); cgroup_leave_frozen(true); -diff --git a/kernel/softirq.c b/kernel/softirq.c -index c8a6913c067d9..ab1fe34326bab 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c +diff -rupN linux.orig/kernel/softirq.c linux/kernel/softirq.c +--- linux.orig/kernel/softirq.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/softirq.c 2022-12-04 10:40:26.724034024 -0500 @@ -637,6 +637,24 @@ static inline void tick_irq_exit(void) #endif } @@ -7124,7 +35266,7 @@ index c8a6913c067d9..ab1fe34326bab 100644 tick_irq_exit(); } -@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq_threads = { +@@ -976,12 +999,70 @@ static struct smp_hotplug_thread softirq .thread_comm = "ksoftirqd/%u", }; @@ -7196,11 +35338,10 @@ index c8a6913c067d9..ab1fe34326bab 100644 return 0; } early_initcall(spawn_ksoftirqd); -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 23af5eca11b14..b0b4e44dd0968 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) +diff -rupN linux.orig/kernel/time/hrtimer.c linux/kernel/time/hrtimer.c +--- linux.orig/kernel/time/hrtimer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/hrtimer.c 2022-12-04 10:40:26.724034024 -0500 +@@ -1805,7 +1805,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; @@ -7218,11 +35359,10 @@ index 23af5eca11b14..b0b4e44dd0968 100644 } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); -diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c -index b0e3c9205946f..133e4160ed54b 100644 ---- a/kernel/time/tick-sched.c -+++ b/kernel/time/tick-sched.c -@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +diff -rupN linux.orig/kernel/time/tick-sched.c linux/kernel/time/tick-sched.c +--- linux.orig/kernel/time/tick-sched.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/tick-sched.c 2022-12-04 10:40:26.724034024 -0500 +@@ -779,7 +779,7 @@ static void tick_nohz_restart(struct tic static inline bool local_timer_softirq_pending(void) { @@ -7231,10 +35371,9 @@ index b0e3c9205946f..133e4160ed54b 100644 } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 717fcb9fb14aa..e6219da89933d 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c +diff -rupN linux.orig/kernel/time/timer.c linux/kernel/time/timer.c +--- linux.orig/kernel/time/timer.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/time/timer.c 2022-12-04 10:40:26.724034024 -0500 @@ -1822,7 +1822,7 @@ static void run_local_timers(void) if (time_before(jiffies, base->next_expiry)) return; @@ -7244,11 +35383,10 @@ index 717fcb9fb14aa..e6219da89933d 100644 } /* -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index cc65887b31bd9..1d01756752676 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +diff -rupN linux.orig/kernel/trace/trace.c linux/kernel/trace/trace.c +--- linux.orig/kernel/trace/trace.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace.c 
2022-12-04 10:40:26.724034024 -0500 +@@ -2640,11 +2640,19 @@ unsigned int tracing_gen_ctx_irq_test(un if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; @@ -7270,7 +35408,7 @@ index cc65887b31bd9..1d01756752676 100644 (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } -@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct trace_array *tr) +@@ -4230,15 +4238,17 @@ unsigned long trace_total_entries(struct static void print_lat_help_header(struct seq_file *m) { @@ -7297,7 +35435,7 @@ index cc65887b31bd9..1d01756752676 100644 } static void print_event_info(struct array_buffer *buf, struct seq_file *m) -@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file +@@ -4272,14 +4282,16 @@ static void print_func_help_header_irq(s print_event_info(buf, m); @@ -7322,11 +35460,10 @@ index cc65887b31bd9..1d01756752676 100644 } void -diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c -index 0356cae0cf74e..585380a3db753 100644 ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) +diff -rupN linux.orig/kernel/trace/trace_events.c linux/kernel/trace/trace_events.c +--- linux.orig/kernel/trace/trace_events.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_events.c 2022-12-04 10:40:26.724034024 -0500 +@@ -193,6 +193,7 @@ static int trace_define_common_fields(vo /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); @@ -7334,11 +35471,10 @@ index 0356cae0cf74e..585380a3db753 100644 return ret; } -diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c -index 67f47ea27921d..de58eaaf1ac7a 100644 ---- a/kernel/trace/trace_output.c -+++ b/kernel/trace/trace_output.c -@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +diff -rupN linux.orig/kernel/trace/trace_output.c linux/kernel/trace/trace_output.c +--- linux.orig/kernel/trace/trace_output.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/trace/trace_output.c 2022-12-04 10:40:26.724034024 -0500 +@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq { char hardsoft_irq; char need_resched; @@ -7346,7 +35482,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 char irqs_off; int hardirq; int softirq; -@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) { @@ -7374,7 +35510,7 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : -@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq softirq ? 's' : '.' 
; @@ -7397,11 +35533,10 @@ index 67f47ea27921d..de58eaaf1ac7a 100644 if (entry->preempt_count & 0xf0) trace_seq_printf(s, "%x", entry->preempt_count >> 4); else -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33e..41596c415111b 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +diff -rupN linux.orig/kernel/watchdog.c linux/kernel/watchdog.c +--- linux.orig/kernel/watchdog.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog.c 2022-12-04 10:40:26.724034024 -0500 +@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_tim /* Start period for the next softlockup warning. */ update_report_ts(); @@ -7410,7 +35545,7 @@ index 8e61f21e7e33e..41596c415111b 100644 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); -@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_tim add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); @@ -7419,11 +35554,10 @@ index 8e61f21e7e33e..41596c415111b 100644 } return HRTIMER_RESTART; -diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c -index 247bf0b1582ca..701f35f0e2d44 100644 ---- a/kernel/watchdog_hld.c -+++ b/kernel/watchdog_hld.c -@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +diff -rupN linux.orig/kernel/watchdog_hld.c linux/kernel/watchdog_hld.c +--- linux.orig/kernel/watchdog_hld.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/kernel/watchdog_hld.c 2022-12-04 10:40:26.724034024 -0500 +@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(s if (__this_cpu_read(hard_watchdog_warn) == true) return; @@ -7432,7 +35566,7 @@ index 247bf0b1582ca..701f35f0e2d44 100644 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); -@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event, +@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(s if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -7441,10 +35575,28 @@ index 247bf0b1582ca..701f35f0e2d44 100644 __this_cpu_write(hard_watchdog_warn, true); return; } -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index cb131fad117cc..c65e69bf4eebb 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug +diff -rupN linux.orig/lib/flex_proportions.c linux/lib/flex_proportions.c +--- linux.orig/lib/flex_proportions.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/flex_proportions.c 2022-12-04 10:40:26.728034014 -0500 +@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_globa + */ + if (events <= 1) + return false; ++ preempt_disable_nested(); + write_seqcount_begin(&p->sequence); + if (periods < 64) + events -= events >> periods; +@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_globa + percpu_counter_add(&p->events, -events); + p->period += periods; + write_seqcount_end(&p->sequence); ++ preempt_enable_nested(); + + return true; + } +diff -rupN linux.orig/lib/Kconfig.debug linux/lib/Kconfig.debug +--- linux.orig/lib/Kconfig.debug 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/Kconfig.debug 2022-12-04 10:40:26.724034024 -0500 @@ -811,6 +811,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. 
@@ -7455,31 +35607,10 @@ index cb131fad117cc..c65e69bf4eebb 100644 config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL -diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c -index 05cccbcf1661a..83332fefa6f42 100644 ---- a/lib/flex_proportions.c -+++ b/lib/flex_proportions.c -@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - */ - if (events <= 1) - return false; -+ preempt_disable_nested(); - write_seqcount_begin(&p->sequence); - if (periods < 64) - events -= events >> periods; -@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) - percpu_counter_add(&p->events, -events); - p->period += periods; - write_seqcount_end(&p->sequence); -+ preempt_enable_nested(); - - return true; - } -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c09..ffaba68e6a290 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_enable(char *str) +diff -rupN linux.orig/lib/vsprintf.c linux/lib/vsprintf.c +--- linux.orig/lib/vsprintf.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/lib/vsprintf.c 2022-12-04 10:40:26.728034014 -0500 +@@ -750,37 +750,42 @@ static int __init debug_boot_weak_hash_e } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); @@ -7543,34 +35674,14 @@ index 3c1853a9d1c09..ffaba68e6a290 100644 #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); -diff --git a/localversion-rt b/localversion-rt -new file mode 100644 -index 0000000000000..08b3e75841adc ---- /dev/null -+++ b/localversion-rt +diff -rupN linux.orig/localversion-rt linux/localversion-rt +--- linux.orig/localversion-rt 1969-12-31 19:00:00.000000000 -0500 ++++ linux/localversion-rt 2022-12-04 10:40:26.728034014 -0500 @@ -0,0 +1 @@ +-rt14 -diff --git a/mm/Kconfig b/mm/Kconfig -index 0331f1461f81c..3897e924e40f2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -579,6 +579,12 @@ config COMPACTION - it and then we would be really interested to hear about that at - linux-mm@kvack.org. - -+config COMPACT_UNEVICTABLE_DEFAULT -+ int -+ depends on COMPACTION -+ default 0 if PREEMPT_RT -+ default 1 -+ - # - # support for free page reporting - config PAGE_REPORTING -diff --git a/mm/compaction.c b/mm/compaction.c -index 640fa76228dd9..10561cb1aaad9 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c +diff -rupN linux.orig/mm/compaction.c linux/mm/compaction.c +--- linux.orig/mm/compaction.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/compaction.c 2022-12-04 10:40:26.728034014 -0500 @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. @@ -7584,10 +35695,25 @@ index 640fa76228dd9..10561cb1aaad9 100644 static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index b69979c9ced5c..d35b6fa560f0a 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c +diff -rupN linux.orig/mm/Kconfig linux/mm/Kconfig +--- linux.orig/mm/Kconfig 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/Kconfig 2022-12-04 10:40:26.728034014 -0500 +@@ -579,6 +579,12 @@ config COMPACTION + it and then we would be really interested to hear about that at + linux-mm@kvack.org. 
+ ++config COMPACT_UNEVICTABLE_DEFAULT ++ int ++ depends on COMPACTION ++ default 0 if PREEMPT_RT ++ default 1 ++ + # + # support for free page reporting + config PAGE_REPORTING +diff -rupN linux.orig/mm/memcontrol.c linux/mm/memcontrol.c +--- linux.orig/mm/memcontrol.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/memcontrol.c 2022-12-04 10:40:26.728034014 -0500 @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) @@ -7618,7 +35744,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lru * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); @@ -7627,7 +35753,7 @@ index b69979c9ced5c..d35b6fa560f0a 100644 switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: -@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lru WARN_ON_ONCE(!in_task()); break; default: @@ -7636,10 +35762,9 @@ index b69979c9ced5c..d35b6fa560f0a 100644 } } -diff --git a/mm/slub.c b/mm/slub.c -index 4b98dff9be8e3..59173fa5901a0 100644 ---- a/mm/slub.c -+++ b/mm/slub.c +diff -rupN linux.orig/mm/slub.c linux/mm/slub.c +--- linux.orig/mm/slub.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/slub.c 2022-12-04 10:40:26.728034014 -0500 @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) @@ -7705,7 +35830,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif #ifdef CONFIG_SLUB_DEBUG -@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +@@ -447,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache * /* * Per slab locking using the pagelock */ @@ -7714,7 +35839,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -455,7 +463,7 @@ static __always_inline void __slab_lock(struct slab *slab) +@@ -455,7 +463,7 @@ static __always_inline void __slab_lock( bit_spin_lock(PG_locked, &page->flags); } @@ -7723,7 +35848,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 { struct page *page = slab_page(slab); -@@ -463,31 +471,19 @@ static __always_inline void __slab_unlock(struct slab *slab) +@@ -463,31 +471,19 @@ static __always_inline void __slab_unloc __bit_spin_unlock(PG_locked, &page->flags); } @@ -7760,7 +35885,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) -@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab +@@ -499,18 +495,15 @@ static inline bool __cmpxchg_double_slab } else #endif { @@ -7782,7 +35907,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } cpu_relax(); -@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -541,16 +534,16 @@ static inline bool cmpxchg_double_slab(s unsigned long flags; local_irq_save(flags); @@ -7802,7 +35927,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_irq_restore(flags); } -@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, +@@ -566,7 +559,7 @@ static inline bool cmpxchg_double_slab(s #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; @@ -7842,7 +35967,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) -@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, +@@ -1329,17 +1298,14 @@ static inline int alloc_consistency_chec } static noinline int alloc_debug_processing(struct kmem_cache *s, @@ -7862,7 +35987,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; -@@ -1390,63 +1356,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, +@@ -1390,63 +1356,6 @@ static inline int free_consistency_check return 1; } @@ -7948,7 +36073,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, -@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -1981,11 +1892,13 @@ static struct slab *allocate_slab(struct */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) @@ -7963,7 +36088,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 account_slab(slab, oo_order(oo), s, flags); -@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +@@ -2012,15 +1925,6 @@ static struct slab *allocate_slab(struct set_freepointer(s, p, NULL); } @@ -7979,11 +36104,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 return slab; } -@@ -2107,6 +2011,75 @@ static inline void remove_partial(struct kmem_cache_node *n, - n->nr_partial--; +@@ -2108,6 +2012,75 @@ static inline void remove_partial(struct } -+/* + /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move @@ -8052,10 +36176,11 @@ index 4b98dff9be8e3..59173fa5901a0 100644 + return object; +} + - /* ++/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. 
-@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + * +@@ -2187,6 +2160,13 @@ static void *get_partial_node(struct kme if (!pfmemalloc_match(slab, gfpflags)) continue; @@ -8069,7 +36194,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 t = acquire_slab(s, n, slab, object == NULL); if (!t) break; -@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +@@ -2793,6 +2773,109 @@ static inline unsigned long node_nr_objs { return atomic_long_read(&n->total_objects); } @@ -8179,7 +36304,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) -@@ -3041,36 +3124,52 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3041,36 +3124,52 @@ new_objects: return NULL; } @@ -8245,7 +36370,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 retry_load_slab: -@@ -3094,11 +3193,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +@@ -3094,11 +3193,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; @@ -8257,7 +36382,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } /* -@@ -3202,14 +3296,8 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_l +@@ -3202,14 +3296,8 @@ redo: object = c->freelist; slab = c->slab; @@ -8274,7 +36399,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { -@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, +@@ -3346,9 +3434,10 @@ static void __slab_free(struct kmem_cach if (kfence_free(head)) return; @@ -8287,7 +36412,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 do { if (unlikely(n)) { -@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3468,6 +3557,7 @@ static __always_inline void do_slab_free void *tail_obj = tail ? 
: head; struct kmem_cache_cpu *c; unsigned long tid; @@ -8295,7 +36420,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 redo: /* -@@ -3482,9 +3572,13 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3482,9 +3572,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); @@ -8312,7 +36437,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 set_freepointer(s, tail_obj, freelist); -@@ -3496,16 +3590,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3496,16 +3590,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } @@ -8331,7 +36456,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { -@@ -3520,11 +3606,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, +@@ -3520,11 +3606,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); @@ -8345,7 +36470,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, -@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3941,6 +4024,7 @@ static void early_kmem_cache_node_alloc( slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); @@ -8353,7 +36478,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); -@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc(int node) +@@ -3955,7 +4039,6 @@ static void early_kmem_cache_node_alloc( n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; @@ -8361,7 +36486,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); -@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, +@@ -4242,23 +4325,21 @@ static void list_slab_objects(struct kme { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); @@ -8390,7 +36515,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 #endif } -@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4616,6 +4697,7 @@ static int __kmem_cache_do_shrink(struct if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; @@ -8398,7 +36523,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } -@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) +@@ -4631,7 +4713,7 @@ static int __kmem_cache_do_shrink(struct /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) @@ -8407,7 +36532,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 if (slabs_node(s, node)) ret = 1; -@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -4991,12 +5073,9 @@ static void validate_slab(struct kmem_ca { void *p; void *addr = slab_address(slab); @@ -8421,7 +36546,7 @@ index 4b98dff9be8e3..59173fa5901a0 100644 /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); -@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, +@@ -5007,8 +5086,6 @@ static void validate_slab(struct kmem_ca if (!check_object(s, slab, p, val)) break; } @@ -8430,7 +36555,7 @@ index 
4b98dff9be8e3..59173fa5901a0 100644 } static int validate_slab_node(struct kmem_cache *s, -@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kmem_cache *s, +@@ -5612,7 +5689,7 @@ static ssize_t validate_store(struct kme { int ret = -EINVAL; @@ -8439,11 +36564,10 @@ index 4b98dff9be8e3..59173fa5901a0 100644 ret = validate_slab_cache(s); if (ret >= 0) ret = length; -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 90af9a8572f5a..7a2d73f152304 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +diff -rupN linux.orig/mm/vmstat.c linux/mm/vmstat.c +--- linux.orig/mm/vmstat.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/mm/vmstat.c 2022-12-04 10:40:26.728034014 -0500 +@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone * * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ @@ -8453,7 +36577,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, +@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone * } __this_cpu_write(*p, x); @@ -8463,7 +36587,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_zone_page_state); -@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist } /* See __mod_node_page_state */ @@ -8473,7 +36597,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 x = delta + __this_cpu_read(*p); -@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, +@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist } __this_cpu_write(*p, x); @@ -8483,7 +36607,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } EXPORT_SYMBOL(__mod_node_page_state); -@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8493,7 +36617,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, __this_cpu_write(*p, -overstep); } @@ -8503,7 +36627,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8513,7 +36637,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data __this_cpu_write(*p, -overstep); } @@ -8523,7 +36647,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) -@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, s8 v, t; /* See __mod_node_page_state */ @@ -8533,7 +36657,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = 
__this_cpu_read(pcp->stat_threshold); -@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, __this_cpu_write(*p, overstep); } @@ -8543,7 +36667,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) -@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ @@ -8553,7 +36677,7 @@ index 90af9a8572f5a..7a2d73f152304 100644 v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); -@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data __this_cpu_write(*p, overstep); } @@ -8563,11 +36687,10 @@ index 90af9a8572f5a..7a2d73f152304 100644 } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) -diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c -index 035812b0461cc..ecdb47712d956 100644 ---- a/net/8021q/vlan_dev.c -+++ b/net/8021q/vlan_dev.c -@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, +diff -rupN linux.orig/net/8021q/vlan_dev.c linux/net/8021q/vlan_dev.c +--- linux.orig/net/8021q/vlan_dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/8021q/vlan_dev.c 2022-12-04 10:40:26.728034014 -0500 +@@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { @@ -8583,11 +36706,10 @@ index 035812b0461cc..ecdb47712d956 100644 stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; -diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c -index db4f2641d1cd1..7e2a9fb5786c9 100644 ---- a/net/bridge/br_multicast.c -+++ b/net/bridge/br_multicast.c -@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, +diff -rupN linux.orig/net/bridge/br_multicast.c linux/net/bridge/br_multicast.c +--- linux.orig/net/bridge/br_multicast.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_multicast.c 2022-12-04 10:40:26.728034014 -0500 +@@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct unsigned int start; do { @@ -8599,11 +36721,10 @@ index db4f2641d1cd1..7e2a9fb5786c9 100644 mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); -diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c -index 6e53dc9914094..f2fc284abab38 100644 ---- a/net/bridge/br_vlan.c -+++ b/net/bridge/br_vlan.c -@@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, +diff -rupN linux.orig/net/bridge/br_vlan.c linux/net/bridge/br_vlan.c +--- linux.orig/net/bridge/br_vlan.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/bridge/br_vlan.c 2022-12-04 10:40:26.728034014 -0500 +@@ -1389,12 +1389,12 @@ void br_vlan_get_stats(const struct net_ cpu_stats = per_cpu_ptr(v->stats, i); do { @@ -8618,11 +36739,2324 @@ index 6e53dc9914094..f2fc284abab38 100644 u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); -diff --git a/net/core/dev.c b/net/core/dev.c -index 56c8b0921c9fd..d96506980d2f2 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *data) +diff -rupN linux.orig/net/bridge/br_vlan.c.orig linux/net/bridge/br_vlan.c.orig +--- 
linux.orig/net/bridge/br_vlan.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/bridge/br_vlan.c.orig 2022-12-04 10:40:18.724054527 -0500 +@@ -0,0 +1,2310 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++ ++#include "br_private.h" ++#include "br_private_tunnel.h" ++ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); ++ ++static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, ++ const void *ptr) ++{ ++ const struct net_bridge_vlan *vle = ptr; ++ u16 vid = *(u16 *)arg->key; ++ ++ return vle->vid != vid; ++} ++ ++static const struct rhashtable_params br_vlan_rht_params = { ++ .head_offset = offsetof(struct net_bridge_vlan, vnode), ++ .key_offset = offsetof(struct net_bridge_vlan, vid), ++ .key_len = sizeof(u16), ++ .nelem_hint = 3, ++ .max_size = VLAN_N_VID, ++ .obj_cmpfn = br_vlan_cmp, ++ .automatic_shrinking = true, ++}; ++ ++static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) ++{ ++ return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); ++} ++ ++static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, ++ const struct net_bridge_vlan *v) ++{ ++ if (vg->pvid == v->vid) ++ return; ++ ++ smp_wmb(); ++ br_vlan_set_pvid_state(vg, v->state); ++ vg->pvid = v->vid; ++} ++ ++static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (vg->pvid != vid) ++ return; ++ ++ smp_wmb(); ++ vg->pvid = 0; ++} ++ ++/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. ++ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and ++ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. ++ */ ++static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, ++ bool commit) ++{ ++ struct net_bridge_vlan_group *vg; ++ bool change; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ /* check if anything would be changed on commit */ ++ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ++ ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); ++ ++ if (!commit) ++ goto out; ++ ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ __vlan_add_pvid(vg, v); ++ else ++ __vlan_delete_pvid(vg, v->vid); ++ ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ else ++ v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; ++ ++out: ++ return change; ++} ++ ++static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) ++{ ++ return __vlan_flags_update(v, flags, false); ++} ++ ++static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) ++{ ++ __vlan_flags_update(v, flags, true); ++} ++ ++static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, ++ struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q add. 
++ */ ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); ++ if (err == -EOPNOTSUPP) ++ return vlan_vid_add(dev, br->vlan_proto, v->vid); ++ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; ++ return err; ++} ++ ++static void __vlan_add_list(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct list_head *headp, *hpos; ++ struct net_bridge_vlan *vent; ++ ++ if (br_vlan_is_master(v)) ++ vg = br_vlan_group(v->br); ++ else ++ vg = nbp_vlan_group(v->port); ++ ++ headp = &vg->vlan_list; ++ list_for_each_prev(hpos, headp) { ++ vent = list_entry(hpos, struct net_bridge_vlan, vlist); ++ if (v->vid >= vent->vid) ++ break; ++ } ++ list_add_rcu(&v->vlist, hpos); ++} ++ ++static void __vlan_del_list(struct net_bridge_vlan *v) ++{ ++ list_del_rcu(&v->vlist); ++} ++ ++static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, ++ const struct net_bridge_vlan *v) ++{ ++ int err; ++ ++ /* Try switchdev op first. In case it is not supported, fallback to ++ * 8021q del. ++ */ ++ err = br_switchdev_port_vlan_del(dev, v->vid); ++ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) ++ vlan_vid_del(dev, br->vlan_proto, v->vid); ++ return err == -EOPNOTSUPP ? 0 : err; ++} ++ ++/* Returns a master vlan, if it didn't exist it gets created. In all cases ++ * a reference is taken to the master vlan before returning. ++ */ ++static struct net_bridge_vlan * ++br_vlan_get_master(struct net_bridge *br, u16 vid, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *masterv; ++ ++ vg = br_vlan_group(br); ++ masterv = br_vlan_find(vg, vid); ++ if (!masterv) { ++ bool changed; ++ ++ /* missing global ctx, create it now */ ++ if (br_vlan_add(br, vid, 0, &changed, extack)) ++ return NULL; ++ masterv = br_vlan_find(vg, vid); ++ if (WARN_ON(!masterv)) ++ return NULL; ++ refcount_set(&masterv->refcnt, 1); ++ return masterv; ++ } ++ refcount_inc(&masterv->refcnt); ++ ++ return masterv; ++} ++ ++static void br_master_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(!br_vlan_is_master(v)); ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_put_master(struct net_bridge_vlan *masterv) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ if (!br_vlan_is_master(masterv)) ++ return; ++ ++ vg = br_vlan_group(masterv->br); ++ if (refcount_dec_and_test(&masterv->refcnt)) { ++ rhashtable_remove_fast(&vg->vlan_hash, ++ &masterv->vnode, br_vlan_rht_params); ++ __vlan_del_list(masterv); ++ br_multicast_toggle_one_vlan(masterv, false); ++ br_multicast_ctx_deinit(&masterv->br_mcast_ctx); ++ call_rcu(&masterv->rcu, br_master_vlan_rcu_free); ++ } ++} ++ ++static void nbp_vlan_rcu_free(struct rcu_head *rcu) ++{ ++ struct net_bridge_vlan *v; ++ ++ v = container_of(rcu, struct net_bridge_vlan, rcu); ++ WARN_ON(br_vlan_is_master(v)); ++ /* if we had per-port stats configured then free them here */ ++ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ kfree(v); ++} ++ ++static void br_vlan_init_state(struct net_bridge_vlan *v) ++{ ++ struct net_bridge *br; ++ ++ if (br_vlan_is_master(v)) ++ br = v->br; ++ else ++ br = v->port->br; ++ ++ if (br_opt_get(br, BROPT_MST_ENABLED)) { ++ br_mst_vlan_init_state(v); ++ return; ++ } ++ ++ v->state = BR_STATE_FORWARDING; ++ v->msti = 0; ++} ++ ++/* This is the shared VLAN add function which works for both ports and bridge ++ * devices. 
There are four possible calls to this function in terms of the ++ * vlan entry type: ++ * 1. vlan is being added on a port (no master flags, global entry exists) ++ * 2. vlan is being added on a bridge (both master and brentry flags) ++ * 3. vlan is being added on a port, but a global entry didn't exist which ++ * is being created right now (master flag set, brentry flag unset), the ++ * global entry is used for global per-vlan features, but not for filtering ++ * 4. same as 3 but with both master and brentry flags set so the entry ++ * will be used for filtering in both the port and the bridge ++ */ ++static int __vlan_add(struct net_bridge_vlan *v, u16 flags, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *masterv = NULL; ++ struct net_bridge_port *p = NULL; ++ struct net_bridge_vlan_group *vg; ++ struct net_device *dev; ++ struct net_bridge *br; ++ int err; ++ ++ if (br_vlan_is_master(v)) { ++ br = v->br; ++ dev = br->dev; ++ vg = br_vlan_group(br); ++ } else { ++ p = v->port; ++ br = p->br; ++ dev = p->dev; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (p) { ++ /* Add VLAN to the device filter if it is supported. ++ * This ensures tagged traffic enters the bridge when ++ * promiscuous mode is disabled by br_manage_promisc(). ++ */ ++ err = __vlan_vid_add(dev, br, v, flags, extack); ++ if (err) ++ goto out; ++ ++ /* need to work on the master vlan too */ ++ if (flags & BRIDGE_VLAN_INFO_MASTER) { ++ bool changed; ++ ++ err = br_vlan_add(br, v->vid, ++ flags | BRIDGE_VLAN_INFO_BRENTRY, ++ &changed, extack); ++ if (err) ++ goto out_filt; ++ ++ if (changed) ++ br_vlan_notify(br, NULL, v->vid, 0, ++ RTM_NEWVLAN); ++ } ++ ++ masterv = br_vlan_get_master(br, v->vid, extack); ++ if (!masterv) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->brvlan = masterv; ++ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { ++ v->stats = ++ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!v->stats) { ++ err = -ENOMEM; ++ goto out_filt; ++ } ++ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; ++ } else { ++ v->stats = masterv->stats; ++ } ++ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); ++ } else { ++ if (br_vlan_should_use(v)) { ++ err = br_switchdev_port_vlan_add(dev, v->vid, flags, ++ false, extack); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ } ++ br_multicast_ctx_init(br, v, &v->br_mcast_ctx); ++ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; ++ } ++ ++ /* Add the dev mac and count the vlan only if it's usable */ ++ if (br_vlan_should_use(v)) { ++ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); ++ if (err) { ++ br_err(br, "failed insert local address into bridge forwarding table\n"); ++ goto out_filt; ++ } ++ vg->num_vlans++; ++ } ++ ++ /* set the state before publishing */ ++ br_vlan_init_state(v); ++ ++ err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ if (err) ++ goto out_fdb_insert; ++ ++ __vlan_add_list(v); ++ __vlan_flags_commit(v, flags); ++ br_multicast_toggle_one_vlan(v, true); ++ ++ if (p) ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++out: ++ return err; ++ ++out_fdb_insert: ++ if (br_vlan_should_use(v)) { ++ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); ++ vg->num_vlans--; ++ } ++ ++out_filt: ++ if (p) { ++ __vlan_vid_del(dev, br, v); ++ if (masterv) { ++ if (v->stats && masterv->stats != v->stats) ++ free_percpu(v->stats); ++ v->stats = NULL; ++ ++ br_vlan_put_master(masterv); ++ v->brvlan = NULL; ++ } ++ } else { ++ br_switchdev_port_vlan_del(dev, v->vid); ++ } ++ ++ goto out; ++} ++ ++static int 
__vlan_del(struct net_bridge_vlan *v) ++{ ++ struct net_bridge_vlan *masterv = v; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0; ++ ++ if (br_vlan_is_master(v)) { ++ vg = br_vlan_group(v->br); ++ } else { ++ p = v->port; ++ vg = nbp_vlan_group(v->port); ++ masterv = v->brvlan; ++ } ++ ++ __vlan_delete_pvid(vg, v->vid); ++ if (p) { ++ err = __vlan_vid_del(p->dev, p->br, v); ++ if (err) ++ goto out; ++ } else { ++ err = br_switchdev_port_vlan_del(v->br->dev, v->vid); ++ if (err && err != -EOPNOTSUPP) ++ goto out; ++ err = 0; ++ } ++ ++ if (br_vlan_should_use(v)) { ++ v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans--; ++ } ++ ++ if (masterv != v) { ++ vlan_tunnel_info_del(vg, v); ++ rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, ++ br_vlan_rht_params); ++ __vlan_del_list(v); ++ nbp_vlan_set_vlan_dev_state(p, v->vid); ++ br_multicast_toggle_one_vlan(v, false); ++ br_multicast_port_ctx_deinit(&v->port_mcast_ctx); ++ call_rcu(&v->rcu, nbp_vlan_rcu_free); ++ } ++ ++ br_vlan_put_master(masterv); ++out: ++ return err; ++} ++ ++static void __vlan_group_free(struct net_bridge_vlan_group *vg) ++{ ++ WARN_ON(!list_empty(&vg->vlan_list)); ++ rhashtable_destroy(&vg->vlan_hash); ++ vlan_tunnel_deinit(vg); ++ kfree(vg); ++} ++ ++static void __vlan_flush(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg) ++{ ++ struct net_bridge_vlan *vlan, *tmp; ++ u16 v_start = 0, v_end = 0; ++ int err; ++ ++ __vlan_delete_pvid(vg, vg->pvid); ++ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { ++ /* take care of disjoint ranges */ ++ if (!v_start) { ++ v_start = vlan->vid; ++ } else if (vlan->vid - v_end != 1) { ++ /* found range end, notify and start next one */ ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++ v_start = vlan->vid; ++ } ++ v_end = vlan->vid; ++ ++ err = __vlan_del(vlan); ++ if (err) { ++ br_err(br, ++ "port %u(%s) failed to delete vlan %d: %pe\n", ++ (unsigned int) p->port_no, p->dev->name, ++ vlan->vid, ERR_PTR(err)); ++ } ++ } ++ ++ /* notify about the last/whole vlan range */ ++ if (v_start) ++ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); ++} ++ ++struct sk_buff *br_handle_vlan(struct net_bridge *br, ++ const struct net_bridge_port *p, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ goto out; ++ ++ /* At this point, we know that the frame was filtered and contains ++ * a valid vlan id. If the vlan id has untagged flag set, ++ * send untagged; otherwise, send tagged. ++ */ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ /* Vlan entry must be configured at this point. The ++ * only exception is the bridge is set in promisc mode and the ++ * packet is destined for the bridge device. In this case ++ * pass the packet as is. 
++ */ ++ if (!v || !br_vlan_should_use(v)) { ++ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { ++ goto out; ++ } else { ++ kfree_skb(skb); ++ return NULL; ++ } ++ } ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->tx_bytes, skb->len); ++ u64_stats_inc(&stats->tx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ /* If the skb will be sent using forwarding offload, the assumption is ++ * that the switchdev will inject the packet into hardware together ++ * with the bridge VLAN, so that it can be forwarded according to that ++ * VLAN. The switchdev should deal with popping the VLAN header in ++ * hardware on each egress port as appropriate. So only strip the VLAN ++ * header if forwarding offload is not being used. ++ */ ++ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && ++ !br_switchdev_frame_uses_tx_fwd_offload(skb)) ++ __vlan_hwaccel_clear_tag(skb); ++ ++ if (p && (p->flags & BR_VLAN_TUNNEL) && ++ br_handle_egress_vlan_tunnel(skb, v)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++out: ++ return skb; ++} ++ ++/* Called under RCU */ ++static bool __allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct sk_buff *skb, u16 *vid, ++ u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ struct pcpu_sw_netstats *stats; ++ struct net_bridge_vlan *v; ++ bool tagged; ++ ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = true; ++ /* If vlan tx offload is disabled on bridge device and frame was ++ * sent from vlan device on the bridge device, it does not have ++ * HW accelerated vlan tag. ++ */ ++ if (unlikely(!skb_vlan_tag_present(skb) && ++ skb->protocol == br->vlan_proto)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ return false; ++ } ++ ++ if (!br_vlan_get_tag(skb, vid)) { ++ /* Tagged frame */ ++ if (skb->vlan_proto != br->vlan_proto) { ++ /* Protocol-mismatch, empty out vlan_tci for new tag */ ++ skb_push(skb, ETH_HLEN); ++ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (unlikely(!skb)) ++ return false; ++ ++ skb_pull(skb, ETH_HLEN); ++ skb_reset_mac_len(skb); ++ *vid = 0; ++ tagged = false; ++ } else { ++ tagged = true; ++ } ++ } else { ++ /* Untagged frame */ ++ tagged = false; ++ } ++ ++ if (!*vid) { ++ u16 pvid = br_get_pvid(vg); ++ ++ /* Frame had a tag with VID 0 or did not have a tag. ++ * See if pvid is set on this port. That tells us which ++ * vlan untagged or priority-tagged traffic belongs to. ++ */ ++ if (!pvid) ++ goto drop; ++ ++ /* PVID is set on this port. Any untagged or priority-tagged ++ * ingress frame is considered to belong to this vlan. ++ */ ++ *vid = pvid; ++ if (likely(!tagged)) ++ /* Untagged Frame. */ ++ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); ++ else ++ /* Priority-tagged Frame. ++ * At this point, we know that skb->vlan_tci VID ++ * field was 0. ++ * We update only VID field and preserve PCP field. 
++ */ ++ skb->vlan_tci |= pvid; ++ ++ /* if snooping and stats are disabled we can avoid the lookup */ ++ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && ++ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_pvid_state(vg); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ return true; ++ } ++ } ++ v = br_vlan_find(vg, *vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto drop; ++ ++ if (*state == BR_STATE_FORWARDING) { ++ *state = br_vlan_get_state(v); ++ if (!br_vlan_state_allowed(*state, true)) ++ goto drop; ++ } ++ ++ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { ++ stats = this_cpu_ptr(v->stats); ++ u64_stats_update_begin(&stats->syncp); ++ u64_stats_add(&stats->rx_bytes, skb->len); ++ u64_stats_inc(&stats->rx_packets); ++ u64_stats_update_end(&stats->syncp); ++ } ++ ++ *vlan = v; ++ ++ return true; ++ ++drop: ++ kfree_skb(skb); ++ return false; ++} ++ ++bool br_allowed_ingress(const struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, struct sk_buff *skb, ++ u16 *vid, u8 *state, ++ struct net_bridge_vlan **vlan) ++{ ++ /* If VLAN filtering is disabled on the bridge, all packets are ++ * permitted. ++ */ ++ *vlan = NULL; ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ BR_INPUT_SKB_CB(skb)->vlan_filtered = false; ++ return true; ++ } ++ ++ return __allowed_ingress(br, vg, skb, vid, state, vlan); ++} ++ ++/* Called under RCU. */ ++bool br_allowed_egress(struct net_bridge_vlan_group *vg, ++ const struct sk_buff *skb) ++{ ++ const struct net_bridge_vlan *v; ++ u16 vid; ++ ++ /* If this packet was not filtered at input, let it pass */ ++ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) ++ return true; ++ ++ br_vlan_get_tag(skb, &vid); ++ v = br_vlan_find(vg, vid); ++ if (v && br_vlan_should_use(v) && ++ br_vlan_state_allowed(br_vlan_get_state(v), false)) ++ return true; ++ ++ return false; ++} ++ ++/* Called under RCU */ ++bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge *br = p->br; ++ struct net_bridge_vlan *v; ++ ++ /* If filtering was disabled at input, let it pass. */ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return true; ++ ++ vg = nbp_vlan_group_rcu(p); ++ if (!vg || !vg->num_vlans) ++ return false; ++ ++ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) ++ *vid = 0; ++ ++ if (!*vid) { ++ *vid = br_get_pvid(vg); ++ if (!*vid || ++ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) ++ return false; ++ ++ return true; ++ } ++ ++ v = br_vlan_find(vg, *vid); ++ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) ++ return true; ++ ++ return false; ++} ++ ++static int br_vlan_add_existing(struct net_bridge *br, ++ struct net_bridge_vlan_group *vg, ++ struct net_bridge_vlan *vlan, ++ u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ bool becomes_brentry = false; ++ int err; ++ ++ if (!br_vlan_is_brentry(vlan)) { ++ /* Trying to change flags of non-existent bridge vlan */ ++ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) ++ return -EINVAL; ++ ++ becomes_brentry = true; ++ } ++ ++ /* Master VLANs that aren't brentries weren't notified before, ++ * time to notify them now. 
++ */ ++ if (becomes_brentry || would_change) { ++ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, ++ would_change, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ if (becomes_brentry) { ++ /* It was only kept for port vlans, now make it real */ ++ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); ++ if (err) { ++ br_err(br, "failed to insert local address into bridge forwarding table\n"); ++ goto err_fdb_insert; ++ } ++ ++ refcount_inc(&vlan->refcnt); ++ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; ++ vg->num_vlans++; ++ *changed = true; ++ br_multicast_toggle_one_vlan(vlan, true); ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ if (would_change) ++ *changed = true; ++ ++ return 0; ++ ++err_fdb_insert: ++ br_switchdev_port_vlan_del(br->dev, vlan->vid); ++ return err; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, ++ struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vg = br_vlan_group(br); ++ vlan = br_vlan_find(vg, vid); ++ if (vlan) ++ return br_vlan_add_existing(br, vg, vlan, flags, changed, ++ extack); ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); ++ if (!vlan->stats) { ++ kfree(vlan); ++ return -ENOMEM; ++ } ++ vlan->vid = vid; ++ vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; ++ vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; ++ vlan->br = br; ++ if (flags & BRIDGE_VLAN_INFO_BRENTRY) ++ refcount_set(&vlan->refcnt, 1); ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) { ++ free_percpu(vlan->stats); ++ kfree(vlan); ++ } else { ++ *changed = true; ++ } ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ */ ++int br_vlan_delete(struct net_bridge *br, u16 vid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_is_brentry(v)) ++ return -ENOENT; ++ ++ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); ++ br_fdb_delete_by_port(br, NULL, vid, 0); ++ ++ vlan_tunnel_info_del(vg, v); ++ ++ return __vlan_del(v); ++} ++ ++void br_vlan_flush(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = br_vlan_group(br); ++ __vlan_flush(br, NULL, vg); ++ RCU_INIT_POINTER(br->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ if (!vg) ++ return NULL; ++ ++ return br_vlan_lookup(&vg->vlan_hash, vid); ++} ++ ++/* Must be protected by RTNL. */ ++static void recalculate_group_addr(struct net_bridge *br) ++{ ++ if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) ++ return; ++ ++ spin_lock_bh(&br->lock); ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) { ++ /* Bridge Group Address */ ++ br->group_addr[5] = 0x00; ++ } else { /* vlan_enabled && ETH_P_8021AD */ ++ /* Provider Bridge Group Address */ ++ br->group_addr[5] = 0x08; ++ } ++ spin_unlock_bh(&br->lock); ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_recalculate_fwd_mask(struct net_bridge *br) ++{ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED) || ++ br->vlan_proto == htons(ETH_P_8021Q)) ++ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; ++ else /* vlan_enabled && ETH_P_8021AD */ ++ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ++ ~(1u << br->group_addr[5]); ++} ++ ++int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = val, ++ }; ++ int err; ++ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) ++ return 0; ++ ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) { ++ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); ++ return err; ++ } ++ ++ br_manage_promisc(br); ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { ++ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); ++ br_multicast_toggle_vlan_snooping(br, false, NULL); ++ } ++ ++ return 0; ++} ++ ++bool br_vlan_enabled(const struct net_device *dev) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ return br_opt_get(br, BROPT_VLAN_ENABLED); ++} ++EXPORT_SYMBOL_GPL(br_vlan_enabled); ++ ++int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ *p_proto = ntohs(br->vlan_proto); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_proto); ++ ++int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, ++ struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_protocol = ntohs(proto), ++ }; ++ int err = 0; ++ struct net_bridge_port *p; ++ struct net_bridge_vlan *vlan; ++ struct net_bridge_vlan_group *vg; ++ __be16 oldproto = br->vlan_proto; ++ ++ if (br->vlan_proto == proto) ++ return 0; ++ ++ err = switchdev_port_attr_set(br->dev, &attr, extack); ++ if (err && err != -EOPNOTSUPP) ++ return err; ++ ++ /* Add VLANs for the new proto to the device filter. */ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ err = vlan_vid_add(p->dev, proto, vlan->vid); ++ if (err) ++ goto err_filt; ++ } ++ } ++ ++ br->vlan_proto = proto; ++ ++ recalculate_group_addr(br); ++ br_recalculate_fwd_mask(br); ++ ++ /* Delete VLANs for the old proto from the device filter. 
*/ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, oldproto, vlan->vid); ++ } ++ } ++ ++ return 0; ++ ++err_filt: ++ attr.u.vlan_protocol = ntohs(oldproto); ++ switchdev_port_attr_set(br->dev, &attr, NULL); ++ ++ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ continue; ++ vlan_vid_del(p->dev, proto, vlan->vid); ++ } ++ } ++ ++ return err; ++} ++ ++int br_vlan_set_proto(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ if (!eth_type_vlan(htons(val))) ++ return -EPROTONOSUPPORT; ++ ++ return __br_vlan_set_proto(br, htons(val), extack); ++} ++ ++int br_vlan_set_stats(struct net_bridge *br, unsigned long val) ++{ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) ++{ ++ struct net_bridge_port *p; ++ ++ /* allow to change the option if there are no port vlans configured */ ++ list_for_each_entry(p, &br->port_list, list) { ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ ++ if (vg->num_vlans) ++ return -EBUSY; ++ } ++ ++ switch (val) { ++ case 0: ++ case 1: ++ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ if (vid != vg->pvid) ++ return false; ++ ++ v = br_vlan_lookup(&vg->vlan_hash, vid); ++ if (v && br_vlan_should_use(v) && ++ (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return true; ++ ++ return false; ++} ++ ++static void br_vlan_disable_default_pvid(struct net_bridge *br) ++{ ++ struct net_bridge_port *p; ++ u16 pvid = br->default_pvid; ++ ++ /* Disable default_pvid on all ports where it is still ++ * configured. ++ */ ++ if (vlan_default_pvid(br_vlan_group(br), pvid)) { ++ if (!br_vlan_delete(br, pvid)) ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ if (vlan_default_pvid(nbp_vlan_group(p), pvid) && ++ !nbp_vlan_delete(p, pvid)) ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ br->default_pvid = 0; ++} ++ ++int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_bridge_vlan *pvent; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ unsigned long *changed; ++ bool vlchange; ++ u16 old_pvid; ++ int err = 0; ++ ++ if (!pvid) { ++ br_vlan_disable_default_pvid(br); ++ return 0; ++ } ++ ++ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); ++ if (!changed) ++ return -ENOMEM; ++ ++ old_pvid = br->default_pvid; ++ ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. 
++ */ ++ vg = br_vlan_group(br); ++ pvent = br_vlan_find(vg, pvid); ++ if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && ++ (!pvent || !br_vlan_should_use(pvent))) { ++ err = br_vlan_add(br, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, extack); ++ if (err) ++ goto out; ++ ++ if (br_vlan_delete(br, old_pvid)) ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); ++ __set_bit(0, changed); ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ /* Update default_pvid config only if we do not conflict with ++ * user configuration. ++ */ ++ vg = nbp_vlan_group(p); ++ if ((old_pvid && ++ !vlan_default_pvid(vg, old_pvid)) || ++ br_vlan_find(vg, pvid)) ++ continue; ++ ++ err = nbp_vlan_add(p, pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, extack); ++ if (err) ++ goto err_port; ++ if (nbp_vlan_delete(p, old_pvid)) ++ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); ++ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); ++ __set_bit(p->port_no, changed); ++ } ++ ++ br->default_pvid = pvid; ++ ++out: ++ bitmap_free(changed); ++ return err; ++ ++err_port: ++ list_for_each_entry_continue_reverse(p, &br->port_list, list) { ++ if (!test_bit(p->port_no, changed)) ++ continue; ++ ++ if (old_pvid) { ++ nbp_vlan_add(p, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &vlchange, NULL); ++ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); ++ } ++ nbp_vlan_delete(p, pvid); ++ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); ++ } ++ ++ if (test_bit(0, changed)) { ++ if (old_pvid) { ++ br_vlan_add(br, old_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, ++ &vlchange, NULL); ++ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); ++ } ++ br_vlan_delete(br, pvid); ++ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); ++ } ++ goto out; ++} ++ ++int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, ++ struct netlink_ext_ack *extack) ++{ ++ u16 pvid = val; ++ int err = 0; ++ ++ if (val >= VLAN_VID_MASK) ++ return -EINVAL; ++ ++ if (pvid == br->default_pvid) ++ goto out; ++ ++ /* Only allow default pvid change when filtering is disabled */ ++ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { ++ pr_info_once("Please disable vlan filtering to change default_pvid\n"); ++ err = -EPERM; ++ goto out; ++ } ++ err = __br_vlan_set_default_pvid(br, pvid, extack); ++out: ++ return err; ++} ++ ++int br_vlan_init(struct net_bridge *br) ++{ ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(*vg), GFP_KERNEL); ++ if (!vg) ++ goto out; ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ br->vlan_proto = htons(ETH_P_8021Q); ++ br->default_pvid = 1; ++ rcu_assign_pointer(br->vlgrp, vg); ++ ++out: ++ return ret; ++ ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++ kfree(vg); ++ ++ goto out; ++} ++ ++int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) ++{ ++ struct switchdev_attr attr = { ++ .orig_dev = p->br->dev, ++ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, ++ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, ++ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), ++ }; ++ struct net_bridge_vlan_group *vg; ++ int ret = -ENOMEM; ++ ++ vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); 
++ if (!vg) ++ goto out; ++ ++ ret = switchdev_port_attr_set(p->dev, &attr, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ goto err_vlan_enabled; ++ ++ ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); ++ if (ret) ++ goto err_rhtbl; ++ ret = vlan_tunnel_init(vg); ++ if (ret) ++ goto err_tunnel_init; ++ INIT_LIST_HEAD(&vg->vlan_list); ++ rcu_assign_pointer(p->vlgrp, vg); ++ if (p->br->default_pvid) { ++ bool changed; ++ ++ ret = nbp_vlan_add(p, p->br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED, ++ &changed, extack); ++ if (ret) ++ goto err_vlan_add; ++ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); ++ } ++out: ++ return ret; ++ ++err_vlan_add: ++ RCU_INIT_POINTER(p->vlgrp, NULL); ++ synchronize_rcu(); ++ vlan_tunnel_deinit(vg); ++err_tunnel_init: ++ rhashtable_destroy(&vg->vlan_hash); ++err_rhtbl: ++err_vlan_enabled: ++ kfree(vg); ++ ++ goto out; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. ++ * changed must be true only if the vlan was created or updated ++ */ ++int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, ++ bool *changed, struct netlink_ext_ack *extack) ++{ ++ struct net_bridge_vlan *vlan; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ *changed = false; ++ vlan = br_vlan_find(nbp_vlan_group(port), vid); ++ if (vlan) { ++ bool would_change = __vlan_flags_would_change(vlan, flags); ++ ++ if (would_change) { ++ /* Pass the flags to the hardware bridge */ ++ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, ++ true, extack); ++ if (ret && ret != -EOPNOTSUPP) ++ return ret; ++ } ++ ++ __vlan_flags_commit(vlan, flags); ++ *changed = would_change; ++ ++ return 0; ++ } ++ ++ vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); ++ if (!vlan) ++ return -ENOMEM; ++ ++ vlan->vid = vid; ++ vlan->port = port; ++ ret = __vlan_add(vlan, flags, extack); ++ if (ret) ++ kfree(vlan); ++ else ++ *changed = true; ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. ++ * Must be called with vid in range from 1 to 4094 inclusive. 
++ */ ++int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) ++{ ++ struct net_bridge_vlan *v; ++ ++ ASSERT_RTNL(); ++ ++ v = br_vlan_find(nbp_vlan_group(port), vid); ++ if (!v) ++ return -ENOENT; ++ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); ++ br_fdb_delete_by_port(port->br, port, vid, 0); ++ ++ return __vlan_del(v); ++} ++ ++void nbp_vlan_flush(struct net_bridge_port *port) ++{ ++ struct net_bridge_vlan_group *vg; ++ ++ ASSERT_RTNL(); ++ ++ vg = nbp_vlan_group(port); ++ __vlan_flush(port->br, port, vg); ++ RCU_INIT_POINTER(port->vlgrp, NULL); ++ synchronize_rcu(); ++ __vlan_group_free(vg); ++} ++ ++void br_vlan_get_stats(const struct net_bridge_vlan *v, ++ struct pcpu_sw_netstats *stats) ++{ ++ int i; ++ ++ memset(stats, 0, sizeof(*stats)); ++ for_each_possible_cpu(i) { ++ u64 rxpackets, rxbytes, txpackets, txbytes; ++ struct pcpu_sw_netstats *cpu_stats; ++ unsigned int start; ++ ++ cpu_stats = per_cpu_ptr(v->stats, i); ++ do { ++ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); ++ rxpackets = u64_stats_read(&cpu_stats->rx_packets); ++ rxbytes = u64_stats_read(&cpu_stats->rx_bytes); ++ txbytes = u64_stats_read(&cpu_stats->tx_bytes); ++ txpackets = u64_stats_read(&cpu_stats->tx_packets); ++ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); ++ ++ u64_stats_add(&stats->rx_packets, rxpackets); ++ u64_stats_add(&stats->rx_bytes, rxbytes); ++ u64_stats_add(&stats->tx_bytes, txbytes); ++ u64_stats_add(&stats->tx_packets, txpackets); ++ } ++} ++ ++int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid); ++ ++int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ *p_pvid = br_get_pvid(vg); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); ++ ++void br_vlan_fill_forward_path_pvid(struct net_bridge *br, ++ struct net_device_path_ctx *ctx, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ int idx = ctx->num_vlans - 1; ++ u16 vid; ++ ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return; ++ ++ vg = br_vlan_group(br); ++ ++ if (idx >= 0 && ++ ctx->vlan[idx].proto == br->vlan_proto) { ++ vid = ctx->vlan[idx].id; ++ } else { ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; ++ vid = br_get_pvid(vg); ++ } ++ ++ path->bridge.vlan_id = vid; ++ path->bridge.vlan_proto = br->vlan_proto; ++} ++ ++int br_vlan_fill_forward_path_mode(struct net_bridge *br, ++ struct net_bridge_port *dst, ++ struct net_device_path *path) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ ++ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) ++ return 0; ++ ++ vg = nbp_vlan_group_rcu(dst); ++ v = br_vlan_find(vg, path->bridge.vlan_id); ++ if (!v || !br_vlan_should_use(v)) ++ return -EINVAL; ++ ++ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) ++ return 0; ++ ++ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) ++ path->bridge.vlan_mode = 
DEV_PATH_BR_VLAN_KEEP; ++ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; ++ else ++ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; ++ ++ return 0; ++} ++ ++int br_vlan_get_info(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ ASSERT_RTNL(); ++ p = br_port_get_check_rtnl(dev); ++ if (p) ++ vg = nbp_vlan_group(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info); ++ ++int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, ++ struct bridge_vlan_info *p_vinfo) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v; ++ struct net_bridge_port *p; ++ ++ p = br_port_get_check_rcu(dev); ++ if (p) ++ vg = nbp_vlan_group_rcu(p); ++ else if (netif_is_bridge_master(dev)) ++ vg = br_vlan_group_rcu(netdev_priv(dev)); ++ else ++ return -EINVAL; ++ ++ v = br_vlan_find(vg, vid); ++ if (!v) ++ return -ENOENT; ++ ++ p_vinfo->vid = vid; ++ p_vinfo->flags = v->flags; ++ if (vid == br_get_pvid(vg)) ++ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); ++ ++static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) ++{ ++ return is_vlan_dev(dev) && ++ !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); ++} ++ ++static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, ++ __always_unused struct netdev_nested_priv *priv) ++{ ++ return br_vlan_is_bind_vlan_dev(dev); ++} ++ ++static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) ++{ ++ int found; ++ ++ rcu_read_lock(); ++ found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, ++ NULL); ++ rcu_read_unlock(); ++ ++ return !!found; ++} ++ ++struct br_vlan_bind_walk_data { ++ u16 vid; ++ struct net_device *result; ++}; ++ ++static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_bind_walk_data *data = priv->data; ++ int found = 0; ++ ++ if (br_vlan_is_bind_vlan_dev(dev) && ++ vlan_dev_priv(dev)->vlan_id == data->vid) { ++ data->result = dev; ++ found = 1; ++ } ++ ++ return found; ++} ++ ++static struct net_device * ++br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) ++{ ++ struct br_vlan_bind_walk_data data = { ++ .vid = vid, ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, ++ &priv); ++ rcu_read_unlock(); ++ ++ return data.result; ++} ++ ++static bool br_vlan_is_dev_up(const struct net_device *dev) ++{ ++ return !!(dev->flags & IFF_UP) && netif_oper_up(dev); ++} ++ ++static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, ++ struct net_device *vlan_dev) ++{ ++ u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p; ++ bool has_carrier = false; ++ ++ if (!netif_carrier_ok(br->dev)) { ++ netif_carrier_off(vlan_dev); ++ return; ++ } ++ ++ list_for_each_entry(p, &br->port_list, list) { ++ vg = nbp_vlan_group(p); ++ if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { ++ has_carrier = 
true; ++ break; ++ } ++ } ++ ++ if (has_carrier) ++ netif_carrier_on(vlan_dev); ++ else ++ netif_carrier_off(vlan_dev); ++} ++ ++static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) ++{ ++ struct net_bridge_vlan_group *vg = nbp_vlan_group(p); ++ struct net_bridge_vlan *vlan; ++ struct net_device *vlan_dev; ++ ++ list_for_each_entry(vlan, &vg->vlan_list, vlist) { ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, ++ vlan->vid); ++ if (vlan_dev) { ++ if (br_vlan_is_dev_up(p->dev)) { ++ if (netif_carrier_ok(p->br->dev)) ++ netif_carrier_on(vlan_dev); ++ } else { ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++ } ++ } ++ } ++} ++ ++static void br_vlan_upper_change(struct net_device *dev, ++ struct net_device *upper_dev, ++ bool linking) ++{ ++ struct net_bridge *br = netdev_priv(dev); ++ ++ if (!br_vlan_is_bind_vlan_dev(upper_dev)) ++ return; ++ ++ if (linking) { ++ br_vlan_set_vlan_dev_state(br, upper_dev); ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); ++ } else { ++ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, ++ br_vlan_has_upper_bind_vlan_dev(dev)); ++ } ++} ++ ++struct br_vlan_link_state_walk_data { ++ struct net_bridge *br; ++}; ++ ++static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct br_vlan_link_state_walk_data *data = priv->data; ++ ++ if (br_vlan_is_bind_vlan_dev(vlan_dev)) ++ br_vlan_set_vlan_dev_state(data->br, vlan_dev); ++ ++ return 0; ++} ++ ++static void br_vlan_link_state_change(struct net_device *dev, ++ struct net_bridge *br) ++{ ++ struct br_vlan_link_state_walk_data data = { ++ .br = br ++ }; ++ struct netdev_nested_priv priv = { ++ .data = (void *)&data, ++ }; ++ ++ rcu_read_lock(); ++ netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, ++ &priv); ++ rcu_read_unlock(); ++} ++ ++/* Must be protected by RTNL. */ ++static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) ++{ ++ struct net_device *vlan_dev; ++ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); ++ if (vlan_dev) ++ br_vlan_set_vlan_dev_state(p->br, vlan_dev); ++} ++ ++/* Must be protected by RTNL. */ ++int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) ++{ ++ struct netdev_notifier_changeupper_info *info; ++ struct net_bridge *br = netdev_priv(dev); ++ int vlcmd = 0, ret = 0; ++ bool changed = false; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ ret = br_vlan_add(br, br->default_pvid, ++ BRIDGE_VLAN_INFO_PVID | ++ BRIDGE_VLAN_INFO_UNTAGGED | ++ BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); ++ vlcmd = RTM_NEWVLAN; ++ break; ++ case NETDEV_UNREGISTER: ++ changed = !br_vlan_delete(br, br->default_pvid); ++ vlcmd = RTM_DELVLAN; ++ break; ++ case NETDEV_CHANGEUPPER: ++ info = ptr; ++ br_vlan_upper_change(dev, info->upper_dev, info->linking); ++ break; ++ ++ case NETDEV_CHANGE: ++ case NETDEV_UP: ++ if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) ++ break; ++ br_vlan_link_state_change(dev, br); ++ break; ++ } ++ if (changed) ++ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); ++ ++ return ret; ++} ++ ++/* Must be protected by RTNL. 
*/ ++void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) ++{ ++ if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) ++ return; ++ ++ switch (event) { ++ case NETDEV_CHANGE: ++ case NETDEV_DOWN: ++ case NETDEV_UP: ++ br_vlan_set_all_vlan_dev_state(p); ++ break; ++ } ++} ++ ++static bool br_vlan_stats_fill(struct sk_buff *skb, ++ const struct net_bridge_vlan *v) ++{ ++ struct pcpu_sw_netstats stats; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); ++ if (!nest) ++ return false; ++ ++ br_vlan_get_stats(v, &stats); ++ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, ++ u64_stats_read(&stats.rx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, ++ u64_stats_read(&stats.rx_packets), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, ++ u64_stats_read(&stats.tx_bytes), ++ BRIDGE_VLANDB_STATS_PAD) || ++ nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, ++ u64_stats_read(&stats.tx_packets), ++ BRIDGE_VLANDB_STATS_PAD)) ++ goto out_err; ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++/* v_opts is used to dump the options which must be equal in the whole range */ ++static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, ++ const struct net_bridge_vlan *v_opts, ++ u16 flags, ++ bool dump_stats) ++{ ++ struct bridge_vlan_info info; ++ struct nlattr *nest; ++ ++ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); ++ if (!nest) ++ return false; ++ ++ memset(&info, 0, sizeof(info)); ++ info.vid = vid; ++ if (flags & BRIDGE_VLAN_INFO_UNTAGGED) ++ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; ++ if (flags & BRIDGE_VLAN_INFO_PVID) ++ info.flags |= BRIDGE_VLAN_INFO_PVID; ++ ++ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) ++ goto out_err; ++ ++ if (vid_range && vid < vid_range && ++ !(flags & BRIDGE_VLAN_INFO_PVID) && ++ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) ++ goto out_err; ++ ++ if (v_opts) { ++ if (!br_vlan_opts_fill(skb, v_opts)) ++ goto out_err; ++ ++ if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) ++ goto out_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ ++ return true; ++ ++out_err: ++ nla_nest_cancel(skb, nest); ++ return false; ++} ++ ++static size_t rtnl_vlan_nlmsg_size(void) ++{ ++ return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) ++ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ ++ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ ++ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ ++ + br_vlan_opts_nl_size(); /* bridge vlan options */ ++} ++ ++void br_vlan_notify(const struct net_bridge *br, ++ const struct net_bridge_port *p, ++ u16 vid, u16 vid_range, ++ int cmd) ++{ ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_vlan *v = NULL; ++ struct br_vlan_msg *bvm; ++ struct nlmsghdr *nlh; ++ struct sk_buff *skb; ++ int err = -ENOBUFS; ++ struct net *net; ++ u16 flags = 0; ++ int ifindex; ++ ++ /* right now notifications are done only with rtnl held */ ++ ASSERT_RTNL(); ++ ++ if (p) { ++ ifindex = p->dev->ifindex; ++ vg = nbp_vlan_group(p); ++ net = dev_net(p->dev); ++ } else { ++ ifindex = br->dev->ifindex; ++ vg = br_vlan_group(br); ++ net = dev_net(br->dev); ++ } ++ ++ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); ++ if (!skb) ++ goto out_err; ++ ++ err = -EMSGSIZE; ++ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); ++ if (!nlh) ++ goto out_err; ++ bvm = nlmsg_data(nlh); ++ 
memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = AF_BRIDGE; ++ bvm->ifindex = ifindex; ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ /* need to find the vlan due to flags/options */ ++ v = br_vlan_find(vg, vid); ++ if (!v || !br_vlan_should_use(v)) ++ goto out_kfree; ++ ++ flags = v->flags; ++ if (br_get_pvid(vg) == v->vid) ++ flags |= BRIDGE_VLAN_INFO_PVID; ++ break; ++ case RTM_DELVLAN: ++ break; ++ default: ++ goto out_kfree; ++ } ++ ++ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) ++ goto out_err; ++ ++ nlmsg_end(skb, nlh); ++ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); ++ return; ++ ++out_err: ++ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); ++out_kfree: ++ kfree_skb(skb); ++} ++ ++/* check if v_curr can enter a range ending in range_end */ ++bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, ++ const struct net_bridge_vlan *range_end) ++{ ++ return v_curr->vid - range_end->vid == 1 && ++ range_end->flags == v_curr->flags && ++ br_vlan_opts_eq_range(v_curr, range_end); ++} ++ ++static int br_vlan_dump_dev(const struct net_device *dev, ++ struct sk_buff *skb, ++ struct netlink_callback *cb, ++ u32 dump_flags) ++{ ++ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; ++ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); ++ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); ++ struct net_bridge_vlan_group *vg; ++ int idx = 0, s_idx = cb->args[1]; ++ struct nlmsghdr *nlh = NULL; ++ struct net_bridge_port *p; ++ struct br_vlan_msg *bvm; ++ struct net_bridge *br; ++ int err = 0; ++ u16 pvid; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) ++ return -EINVAL; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group_rcu(br); ++ p = NULL; ++ } else { ++ /* global options are dumped only for bridge devices */ ++ if (dump_global) ++ return 0; ++ ++ p = br_port_get_rcu(dev); ++ if (WARN_ON(!p)) ++ return -EINVAL; ++ vg = nbp_vlan_group_rcu(p); ++ br = p->br; ++ } ++ ++ if (!vg) ++ return 0; ++ ++ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ++ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); ++ if (!nlh) ++ return -EMSGSIZE; ++ bvm = nlmsg_data(nlh); ++ memset(bvm, 0, sizeof(*bvm)); ++ bvm->family = PF_BRIDGE; ++ bvm->ifindex = dev->ifindex; ++ pvid = br_get_pvid(vg); ++ ++ /* idx must stay at range's beginning until it is filled in */ ++ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { ++ if (!dump_global && !br_vlan_should_use(v)) ++ continue; ++ if (idx < s_idx) { ++ idx++; ++ continue; ++ } ++ ++ if (!range_start) { ++ range_start = v; ++ range_end = v; ++ continue; ++ } ++ ++ if (dump_global) { ++ if (br_vlan_global_opts_can_enter_range(v, range_end)) ++ goto update_end; ++ if (!br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, ++ range_start)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } else if (dump_stats || v->vid == pvid || ++ !br_vlan_can_enter_range(v, range_end)) { ++ u16 vlan_flags = br_vlan_flags(range_start, pvid); ++ ++ if (!br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ vlan_flags, dump_stats)) { ++ err = -EMSGSIZE; ++ break; ++ } ++ /* advance number of filled vlans */ ++ idx += range_end->vid - range_start->vid + 1; ++ ++ range_start = v; ++ } ++update_end: ++ range_end = v; ++ } ++ ++ /* err will be 0 and range_start will be set in 3 cases here: ++ * - first vlan 
(range_start == range_end) ++ * - last vlan (range_start == range_end, not in range) ++ * - last vlan range (range_start != range_end, in range) ++ */ ++ if (!err && range_start) { ++ if (dump_global && ++ !br_vlan_global_opts_fill(skb, range_start->vid, ++ range_end->vid, range_start)) ++ err = -EMSGSIZE; ++ else if (!dump_global && ++ !br_vlan_fill_vids(skb, range_start->vid, ++ range_end->vid, range_start, ++ br_vlan_flags(range_start, pvid), ++ dump_stats)) ++ err = -EMSGSIZE; ++ } ++ ++ cb->args[1] = err ? idx : 0; ++ ++ nlmsg_end(skb, nlh); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { ++ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, ++}; ++ ++static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) ++{ ++ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; ++ int idx = 0, err = 0, s_idx = cb->args[0]; ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ u32 dump_flags = 0; ++ ++ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, ++ br_vlan_db_dump_pol, cb->extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(cb->nlh); ++ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) ++ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); ++ ++ rcu_read_lock(); ++ if (bvm->ifindex) { ++ dev = dev_get_by_index_rcu(net, bvm->ifindex); ++ if (!dev) { ++ err = -ENODEV; ++ goto out_err; ++ } ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ /* if the dump completed without an error we return 0 here */ ++ if (err != -EMSGSIZE) ++ goto out_err; ++ } else { ++ for_each_netdev_rcu(net, dev) { ++ if (idx < s_idx) ++ goto skip; ++ ++ err = br_vlan_dump_dev(dev, skb, cb, dump_flags); ++ if (err == -EMSGSIZE) ++ break; ++skip: ++ idx++; ++ } ++ } ++ cb->args[0] = idx; ++ rcu_read_unlock(); ++ ++ return skb->len; ++ ++out_err: ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { ++ [BRIDGE_VLANDB_ENTRY_INFO] = ++ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), ++ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, ++ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, ++ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, ++ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, ++}; ++ ++static int br_vlan_rtm_process_one(struct net_device *dev, ++ const struct nlattr *attr, ++ int cmd, struct netlink_ext_ack *extack) ++{ ++ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; ++ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; ++ bool changed = false, skip_processing = false; ++ struct net_bridge_vlan_group *vg; ++ struct net_bridge_port *p = NULL; ++ int err = 0, cmdmap = 0; ++ struct net_bridge *br; ++ ++ if (netif_is_bridge_master(dev)) { ++ br = netdev_priv(dev); ++ vg = br_vlan_group(br); ++ } else { ++ p = br_port_get_rtnl(dev); ++ if (WARN_ON(!p)) ++ return -ENODEV; ++ br = p->br; ++ vg = nbp_vlan_group(p); ++ } ++ ++ if (WARN_ON(!vg)) ++ return -ENODEV; ++ ++ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, ++ br_vlan_db_policy, extack); ++ if (err) ++ return err; ++ ++ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { ++ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); ++ return -EINVAL; ++ } ++ memset(&vrange_end, 0, sizeof(vrange_end)); ++ ++ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); ++ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | ++ BRIDGE_VLAN_INFO_RANGE_END)) { ++ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed 
when using RTM vlan calls"); ++ return -EINVAL; ++ } ++ if (!br_vlan_valid_id(vinfo->vid, extack)) ++ return -EINVAL; ++ ++ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { ++ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); ++ /* validate user-provided flags without RANGE_BEGIN */ ++ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; ++ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; ++ ++ /* vinfo_last is the range start, vinfo the range end */ ++ vinfo_last = vinfo; ++ vinfo = &vrange_end; ++ ++ if (!br_vlan_valid_id(vinfo->vid, extack) || ++ !br_vlan_valid_range(vinfo, vinfo_last, extack)) ++ return -EINVAL; ++ } ++ ++ switch (cmd) { ++ case RTM_NEWVLAN: ++ cmdmap = RTM_SETLINK; ++ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); ++ break; ++ case RTM_DELVLAN: ++ cmdmap = RTM_DELLINK; ++ break; ++ } ++ ++ if (!skip_processing) { ++ struct bridge_vlan_info *tmp_last = vinfo_last; ++ ++ /* br_process_vlan_info may overwrite vinfo_last */ ++ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, ++ &changed, extack); ++ ++ /* notify first if anything changed */ ++ if (changed) ++ br_ifinfo_notify(cmdmap, br, p); ++ ++ if (err) ++ return err; ++ } ++ ++ /* deal with options */ ++ if (cmd == RTM_NEWVLAN) { ++ struct net_bridge_vlan *range_start, *range_end; ++ ++ if (vinfo_last) { ++ range_start = br_vlan_find(vg, vinfo_last->vid); ++ range_end = br_vlan_find(vg, vinfo->vid); ++ } else { ++ range_start = br_vlan_find(vg, vinfo->vid); ++ range_end = range_start; ++ } ++ ++ err = br_vlan_process_options(br, p, range_start, range_end, ++ tb, extack); ++ } ++ ++ return err; ++} ++ ++static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, ++ struct netlink_ext_ack *extack) ++{ ++ struct net *net = sock_net(skb->sk); ++ struct br_vlan_msg *bvm; ++ struct net_device *dev; ++ struct nlattr *attr; ++ int err, vlans = 0; ++ int rem; ++ ++ /* this should validate the header and check for remaining bytes */ ++ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, ++ extack); ++ if (err < 0) ++ return err; ++ ++ bvm = nlmsg_data(nlh); ++ dev = __dev_get_by_index(net, bvm->ifindex); ++ if (!dev) ++ return -ENODEV; ++ ++ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { ++ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); ++ return -EINVAL; ++ } ++ ++ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { ++ switch (nla_type(attr)) { ++ case BRIDGE_VLANDB_ENTRY: ++ err = br_vlan_rtm_process_one(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ case BRIDGE_VLANDB_GLOBAL_OPTIONS: ++ err = br_vlan_rtm_process_global_options(dev, attr, ++ nlh->nlmsg_type, ++ extack); ++ break; ++ default: ++ continue; ++ } ++ ++ vlans++; ++ if (err) ++ break; ++ } ++ if (!vlans) { ++ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++void br_vlan_rtnl_init(void) ++{ ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, ++ br_vlan_rtm_dump, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, ++ br_vlan_rtm_process, NULL, 0); ++ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, ++ br_vlan_rtm_process, NULL, 0); ++} ++ ++void br_vlan_rtnl_uninit(void) ++{ ++ rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); ++ rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); ++} +diff -rupN linux.orig/net/core/dev.c linux/net/core/dev.c +--- linux.orig/net/core/dev.c 2022-12-02 11:43:18.000000000 -0500 ++++ 
linux/net/core/dev.c 2022-12-04 10:40:26.732034003 -0500 +@@ -4582,15 +4582,6 @@ static void rps_trigger_softirq(void *da #endif /* CONFIG_RPS */ @@ -8638,7 +39072,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 -@@ -6661,6 +6652,30 @@ static void skb_defer_free_flush(struct softnet_data *sd) +@@ -6665,6 +6656,30 @@ static void skb_defer_free_flush(struct } } @@ -8669,7 +39103,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -@@ -10492,12 +10507,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, +@@ -10496,12 +10511,12 @@ void dev_fetch_sw_netstats(struct rtnl_l stats = per_cpu_ptr(netstats, cpu); do { @@ -8684,7 +39118,7 @@ index 56c8b0921c9fd..d96506980d2f2 100644 s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; -@@ -11412,7 +11427,11 @@ static int __init net_dev_init(void) +@@ -11416,7 +11431,11 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif @@ -8696,11 +39130,11469 @@ index 56c8b0921c9fd..d96506980d2f2 100644 spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); -diff --git a/net/core/devlink.c b/net/core/devlink.c -index b50bcc18b8d9e..cfa6a099457ae 100644 ---- a/net/core/devlink.c -+++ b/net/core/devlink.c -@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, +diff -rupN linux.orig/net/core/dev.c.orig linux/net/core/dev.c.orig +--- linux.orig/net/core/dev.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/dev.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,11455 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. ++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. 
++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "net-sysfs.h" ++ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; ++struct list_head ptype_all __read_mostly; /* Taps */ ++ ++static int netif_rx_internal(struct sk_buff *skb); ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info); ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack); ++static struct napi_struct *napi_by_id(unsigned int napi_id); ++ ++/* ++ * The @dev_base_head list is protected by @dev_base_lock and the rtnl ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading, or rcu_read_lock() ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base_head list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. ++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. 
++ */ ++DEFINE_RWLOCK(dev_base_lock); ++EXPORT_SYMBOL(dev_base_lock); ++ ++static DEFINE_MUTEX(ifalias_mutex); ++ ++/* protects napi_hash addition/deletion and napi_gen_id */ ++static DEFINE_SPINLOCK(napi_hash_lock); ++ ++static unsigned int napi_gen_id = NR_CPUS; ++static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); ++ ++static DECLARE_RWSEM(devnet_rename_sem); ++ ++static inline void dev_base_seq_inc(struct net *net) ++{ ++ while (++net->dev_base_seq == 0) ++ ; ++} ++ ++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) ++{ ++ unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); ++ ++ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) ++{ ++ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; ++} ++ ++static inline void rps_lock_irqsave(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_save(*flags); ++} ++ ++static inline void rps_lock_irq_disable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_lock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); ++} ++ ++static inline void rps_unlock_irq_restore(struct softnet_data *sd, ++ unsigned long *flags) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_restore(*flags); ++} ++ ++static inline void rps_unlock_irq_enable(struct softnet_data *sd) ++{ ++ if (IS_ENABLED(CONFIG_RPS)) ++ spin_unlock_irq(&sd->input_pkt_queue.lock); ++ else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); ++} ++ ++static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, ++ const char *name) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); ++ if (!name_node) ++ return NULL; ++ INIT_HLIST_NODE(&name_node->hlist); ++ name_node->dev = dev; ++ name_node->name = name; ++ return name_node; ++} ++ ++static struct netdev_name_node * ++netdev_name_node_head_alloc(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node; ++ ++ name_node = netdev_name_node_alloc(dev, dev->name); ++ if (!name_node) ++ return NULL; ++ INIT_LIST_HEAD(&name_node->list); ++ return name_node; ++} ++ ++static void netdev_name_node_free(struct netdev_name_node *name_node) ++{ ++ kfree(name_node); ++} ++ ++static void netdev_name_node_add(struct net *net, ++ struct netdev_name_node *name_node) ++{ ++ hlist_add_head_rcu(&name_node->hlist, ++ dev_name_hash(net, name_node->name)); ++} ++ ++static void netdev_name_node_del(struct netdev_name_node *name_node) ++{ ++ hlist_del_rcu(&name_node->hlist); ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return NULL; ++} ++ ++static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, ++ const char *name) ++{ ++ struct hlist_head *head = dev_name_hash(net, name); ++ struct netdev_name_node *name_node; ++ ++ hlist_for_each_entry_rcu(name_node, head, hlist) ++ if (!strcmp(name_node->name, name)) ++ return name_node; ++ return 
NULL; ++} ++ ++bool netdev_name_in_use(struct net *net, const char *name) ++{ ++ return netdev_name_node_lookup(net, name); ++} ++EXPORT_SYMBOL(netdev_name_in_use); ++ ++int netdev_name_node_alt_create(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (name_node) ++ return -EEXIST; ++ name_node = netdev_name_node_alloc(dev, name); ++ if (!name_node) ++ return -ENOMEM; ++ netdev_name_node_add(net, name_node); ++ /* The node that holds dev->name acts as a head of per-device list. */ ++ list_add_tail(&name_node->list, &dev->name_node->list); ++ ++ return 0; ++} ++ ++static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) ++{ ++ list_del(&name_node->list); ++ netdev_name_node_del(name_node); ++ kfree(name_node->name); ++ netdev_name_node_free(name_node); ++} ++ ++int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) ++{ ++ struct netdev_name_node *name_node; ++ struct net *net = dev_net(dev); ++ ++ name_node = netdev_name_node_lookup(net, name); ++ if (!name_node) ++ return -ENOENT; ++ /* lookup might have found our primary name or a name belonging ++ * to another device. ++ */ ++ if (name_node == dev->name_node || name_node->dev != dev) ++ return -EINVAL; ++ ++ __netdev_name_node_alt_destroy(name_node); ++ ++ return 0; ++} ++ ++static void netdev_name_node_alt_flush(struct net_device *dev) ++{ ++ struct netdev_name_node *name_node, *tmp; ++ ++ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) ++ __netdev_name_node_alt_destroy(name_node); ++} ++ ++/* Device list insertion */ ++static void list_netdevice(struct net_device *dev) ++{ ++ struct net *net = dev_net(dev); ++ ++ ASSERT_RTNL(); ++ ++ write_lock(&dev_base_lock); ++ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); ++ netdev_name_node_add(net, dev->name_node); ++ hlist_add_head_rcu(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(net); ++} ++ ++/* Device list removal ++ * caller must respect a RCU grace period before freeing/reusing dev ++ */ ++static void unlist_netdevice(struct net_device *dev, bool lock) ++{ ++ ASSERT_RTNL(); ++ ++ /* Unlink dev from the device chain */ ++ if (lock) ++ write_lock(&dev_base_lock); ++ list_del_rcu(&dev->dev_list); ++ netdev_name_node_del(dev->name_node); ++ hlist_del_rcu(&dev->index_hlist); ++ if (lock) ++ write_unlock(&dev_base_lock); ++ ++ dev_base_seq_inc(dev_net(dev)); ++} ++ ++/* ++ * Our notifier list ++ */ ++ ++static RAW_NOTIFIER_HEAD(netdev_chain); ++ ++/* ++ * Device drivers call our routines to queue packets here. We empty the ++ * queue in the local softnet handler. 
++ */ ++ ++DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); ++EXPORT_PER_CPU_SYMBOL(softnet_data); ++ ++#ifdef CONFIG_LOCKDEP ++/* ++ * register_netdevice() inits txq->_xmit_lock and sets lockdep class ++ * according to dev->type ++ */ ++static const unsigned short netdev_lock_type[] = { ++ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ++ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ++ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ++ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ++ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ++ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ++ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ++ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ++ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ++ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ++ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ++ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ++ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ++ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ++ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; ++ ++static const char *const netdev_lock_name[] = { ++ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", ++ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", ++ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", ++ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", ++ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", ++ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", ++ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", ++ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", ++ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", ++ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", ++ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", ++ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", ++ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", ++ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", ++ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; ++ ++static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; ++ ++static inline unsigned short netdev_lock_pos(unsigned short dev_type) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) ++ if (netdev_lock_type[i] == dev_type) ++ return i; ++ /* the last key is used by default */ ++ return ARRAY_SIZE(netdev_lock_type) - 1; ++} ++ ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev_type); ++ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], ++ netdev_lock_name[i]); ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++ int i; ++ ++ i = netdev_lock_pos(dev->type); ++ lockdep_set_class_and_name(&dev->addr_list_lock, ++ &netdev_addr_lock_key[i], ++ netdev_lock_name[i]); ++} ++#else ++static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, ++ unsigned short dev_type) ++{ ++} ++ ++static inline void netdev_set_addr_lockdep_class(struct net_device *dev) ++{ ++} ++#endif ++ ++/******************************************************************************* ++ * ++ * Protocol management and registration routines ++ * ++ 
*******************************************************************************/ ++ ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++static inline struct list_head *ptype_head(const struct packet_type *pt) ++{ ++ if (pt->type == htons(ETH_P_ALL)) ++ return pt->dev ? &pt->dev->ptype_all : &ptype_all; ++ else ++ return pt->dev ? &pt->dev->ptype_specific : ++ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; ++} ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. ++ * ++ * This call does not sleep therefore it can not ++ * guarantee all CPU's that are in middle of receiving packets ++ * will see the new packet type (until the next received packet). ++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ ++ spin_lock(&ptype_lock); ++ list_add_rcu(&pt->list, head); ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(dev_add_pack); ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head = ptype_head(pt); ++ struct packet_type *pt1; ++ ++ spin_lock(&ptype_lock); ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ pr_warn("dev_remove_pack: %p not found\n", pt); ++out: ++ spin_unlock(&ptype_lock); ++} ++EXPORT_SYMBOL(__dev_remove_pack); ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. 
++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++EXPORT_SYMBOL(dev_remove_pack); ++ ++ ++/******************************************************************************* ++ * ++ * Device Interface Subroutines ++ * ++ *******************************************************************************/ ++ ++/** ++ * dev_get_iflink - get 'iflink' value of a interface ++ * @dev: targeted interface ++ * ++ * Indicates the ifindex the interface is linked to. ++ * Physical interfaces have the same 'ifindex' and 'iflink' values. ++ */ ++ ++int dev_get_iflink(const struct net_device *dev) ++{ ++ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) ++ return dev->netdev_ops->ndo_get_iflink(dev); ++ ++ return dev->ifindex; ++} ++EXPORT_SYMBOL(dev_get_iflink); ++ ++/** ++ * dev_fill_metadata_dst - Retrieve tunnel egress information. ++ * @dev: targeted interface ++ * @skb: The packet. ++ * ++ * For better visibility of tunnel traffic OVS needs to retrieve ++ * egress tunnel information for a packet. Following API allows ++ * user to get this info. ++ */ ++int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ++{ ++ struct ip_tunnel_info *info; ++ ++ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) ++ return -EINVAL; ++ ++ info = skb_tunnel_info_unclone(skb); ++ if (!info) ++ return -ENOMEM; ++ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) ++ return -EINVAL; ++ ++ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); ++} ++EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); ++ ++static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) ++{ ++ int k = stack->num_paths++; ++ ++ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) ++ return NULL; ++ ++ return &stack->path[k]; ++} ++ ++int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, ++ struct net_device_path_stack *stack) ++{ ++ const struct net_device *last_dev; ++ struct net_device_path_ctx ctx = { ++ .dev = dev, ++ }; ++ struct net_device_path *path; ++ int ret = 0; ++ ++ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); ++ stack->num_paths = 0; ++ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { ++ last_dev = ctx.dev; ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ ++ memset(path, 0, sizeof(struct net_device_path)); ++ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); ++ if (ret < 0) ++ return -1; ++ ++ if (WARN_ON_ONCE(last_dev == ctx.dev)) ++ return -1; ++ } ++ ++ if (!ctx.dev) ++ return ret; ++ ++ path = dev_fwd_path(stack); ++ if (!path) ++ return -1; ++ path->type = DEV_PATH_ETHERNET; ++ path->dev = ctx.dev; ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(dev_fill_forward_path); ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup(net, name); ++ return node_name ? 
node_name->dev : NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_name); ++ ++/** ++ * dev_get_by_name_rcu - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. ++ * If the name is found a pointer to the device is returned. ++ * If the name is not found then %NULL is returned. ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) ++{ ++ struct netdev_name_node *node_name; ++ ++ node_name = netdev_name_node_lookup_rcu(net, name); ++ return node_name ? node_name->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_name_rcu); ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @net: the applicable net namespace ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. ++ */ ++ ++struct net_device *dev_get_by_name(struct net *net, const char *name) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_name_rcu(net, name); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_name); ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(__dev_get_by_index); ++ ++/** ++ * dev_get_by_index_rcu - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ struct hlist_head *head = dev_index_hash(net, ifindex); ++ ++ hlist_for_each_entry_rcu(dev, head, index_hlist) ++ if (dev->ifindex == ifindex) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_get_by_index_rcu); ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @net: the applicable net namespace ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device *dev_get_by_index(struct net *net, int ifindex) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ dev = dev_get_by_index_rcu(net, ifindex); ++ dev_hold(dev); ++ rcu_read_unlock(); ++ return dev; ++} ++EXPORT_SYMBOL(dev_get_by_index); ++ ++/** ++ * dev_get_by_napi_id - find a device by napi_id ++ * @napi_id: ID of the NAPI struct ++ * ++ * Search for an interface by NAPI ID. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not had ++ * its reference counter increased so the caller must be careful ++ * about locking. The caller must hold RCU lock. ++ */ ++ ++struct net_device *dev_get_by_napi_id(unsigned int napi_id) ++{ ++ struct napi_struct *napi; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ if (napi_id < MIN_NAPI_ID) ++ return NULL; ++ ++ napi = napi_by_id(napi_id); ++ ++ return napi ? napi->dev : NULL; ++} ++EXPORT_SYMBOL(dev_get_by_napi_id); ++ ++/** ++ * netdev_get_name - get a netdevice name, knowing its ifindex. ++ * @net: network namespace ++ * @name: a pointer to the buffer where the name will be stored. ++ * @ifindex: the ifindex of the interface to get the name from. ++ */ ++int netdev_get_name(struct net *net, char *name, int ifindex) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ down_read(&devnet_rename_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_index_rcu(net, ifindex); ++ if (!dev) { ++ ret = -ENODEV; ++ goto out; ++ } ++ ++ strcpy(name, dev->name); ++ ++ ret = 0; ++out: ++ rcu_read_unlock(); ++ up_read(&devnet_rename_sem); ++ return ret; ++} ++ ++/** ++ * dev_getbyhwaddr_rcu - find a device by its hardware address ++ * @net: the applicable net namespace ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. ++ * The caller must hold RCU or RTNL. ++ * The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ */ ++ ++struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, ++ const char *ha) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ return dev; ++ ++ return NULL; ++} ++EXPORT_SYMBOL(dev_getbyhwaddr_rcu); ++ ++struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) ++{ ++ struct net_device *dev, *ret = NULL; ++ ++ rcu_read_lock(); ++ for_each_netdev_rcu(net, dev) ++ if (dev->type == type) { ++ dev_hold(dev); ++ ret = dev; ++ break; ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @net: the applicable net namespace ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. Must be called inside ++ * rtnl_lock(), and result refcount is unchanged. 
++ */ ++ ++struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, ++ unsigned short mask) ++{ ++ struct net_device *dev, *ret; ++ ++ ASSERT_RTNL(); ++ ++ ret = NULL; ++ for_each_netdev(net, dev) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ ret = dev; ++ break; ++ } ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__dev_get_by_flags); ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * allow sysfs to work. We also disallow any kind of ++ * whitespace. ++ */ ++bool dev_valid_name(const char *name) ++{ ++ if (*name == '\0') ++ return false; ++ if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) ++ return false; ++ if (!strcmp(name, ".") || !strcmp(name, "..")) ++ return false; ++ ++ while (*name) { ++ if (*name == '/' || *name == ':' || isspace(*name)) ++ return false; ++ name++; ++ } ++ return true; ++} ++EXPORT_SYMBOL(dev_valid_name); ++ ++/** ++ * __dev_alloc_name - allocate a name for a device ++ * @net: network namespace to allocate the device name in ++ * @name: name format string ++ * @buf: scratch buffer and result name string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++static int __dev_alloc_name(struct net *net, const char *name, char *buf) ++{ ++ int i = 0; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ unsigned long *inuse; ++ struct net_device *d; ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ p = strchr(name, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. ++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for_each_netdev(net, d) { ++ struct netdev_name_node *name_node; ++ list_for_each_entry(name_node, &d->name_node->list, list) { ++ if (!sscanf(name_node->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, name_node->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ __set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, IFNAMSIZ, name, i); ++ if (!netdev_name_in_use(net, buf)) ++ return i; ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. 
++ */ ++ return -ENFILE; ++} ++ ++static int dev_alloc_name_ns(struct net *net, ++ struct net_device *dev, ++ const char *name) ++{ ++ char buf[IFNAMSIZ]; ++ int ret; ++ ++ BUG_ON(!net); ++ ret = __dev_alloc_name(net, name, buf); ++ if (ret >= 0) ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return ret; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. It scans list of devices to build up a free map, then chooses ++ * the first empty slot. The caller must hold the dev_base or rtnl lock ++ * while allocating the name and adding the device in order to avoid ++ * duplicates. ++ * Limited to bits_per_byte * page size devices (ie 32K on most platforms). ++ * Returns the number of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ return dev_alloc_name_ns(dev_net(dev), dev, name); ++} ++EXPORT_SYMBOL(dev_alloc_name); ++ ++static int dev_get_valid_name(struct net *net, struct net_device *dev, ++ const char *name) ++{ ++ BUG_ON(!net); ++ ++ if (!dev_valid_name(name)) ++ return -EINVAL; ++ ++ if (strchr(name, '%')) ++ return dev_alloc_name_ns(net, dev, name); ++ else if (netdev_name_in_use(net, name)) ++ return -EEXIST; ++ else if (dev->name != name) ++ strlcpy(dev->name, name, IFNAMSIZ); ++ ++ return 0; ++} ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. ++ */ ++int dev_change_name(struct net_device *dev, const char *newname) ++{ ++ unsigned char old_assign_type; ++ char oldname[IFNAMSIZ]; ++ int err = 0; ++ int ret; ++ struct net *net; ++ ++ ASSERT_RTNL(); ++ BUG_ON(!dev_net(dev)); ++ ++ net = dev_net(dev); ++ ++ /* Some auto-enslaved devices e.g. failover slaves are ++ * special, as userspace might rename the device after ++ * the interface had been brought up and running since ++ * the point kernel initiated auto-enslavement. Allow ++ * live name change even when these slave devices are ++ * up and running. ++ * ++ * Typically, users of these auto-enslaving devices ++ * don't actually care about slave name change, as ++ * they are supposed to operate on master interface ++ * directly. 
++ */ ++ if (dev->flags & IFF_UP && ++ likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) ++ return -EBUSY; ++ ++ down_write(&devnet_rename_sem); ++ ++ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { ++ up_write(&devnet_rename_sem); ++ return 0; ++ } ++ ++ memcpy(oldname, dev->name, IFNAMSIZ); ++ ++ err = dev_get_valid_name(net, dev, newname); ++ if (err < 0) { ++ up_write(&devnet_rename_sem); ++ return err; ++ } ++ ++ if (oldname[0] && !strchr(oldname, '%')) ++ netdev_info(dev, "renamed from %s\n", oldname); ++ ++ old_assign_type = dev->name_assign_type; ++ dev->name_assign_type = NET_NAME_RENAMED; ++ ++rollback: ++ ret = device_rename(&dev->dev, dev->name); ++ if (ret) { ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ up_write(&devnet_rename_sem); ++ return ret; ++ } ++ ++ up_write(&devnet_rename_sem); ++ ++ netdev_adjacent_rename_links(dev, oldname); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_del(dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ synchronize_rcu(); ++ ++ write_lock(&dev_base_lock); ++ netdev_name_node_add(net, dev->name_node); ++ write_unlock(&dev_base_lock); ++ ++ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ++ ret = notifier_to_errno(ret); ++ ++ if (ret) { ++ /* err >= 0 after dev_alloc_name() or stores the first errno */ ++ if (err >= 0) { ++ err = ret; ++ down_write(&devnet_rename_sem); ++ memcpy(dev->name, oldname, IFNAMSIZ); ++ memcpy(oldname, newname, IFNAMSIZ); ++ dev->name_assign_type = old_assign_type; ++ old_assign_type = NET_NAME_RENAMED; ++ goto rollback; ++ } else { ++ netdev_err(dev, "name change rollback failed: %d\n", ++ ret); ++ } ++ } ++ ++ return err; ++} ++ ++/** ++ * dev_set_alias - change ifalias of a device ++ * @dev: device ++ * @alias: name up to IFALIASZ ++ * @len: limit of bytes to copy from info ++ * ++ * Set ifalias for a device, ++ */ ++int dev_set_alias(struct net_device *dev, const char *alias, size_t len) ++{ ++ struct dev_ifalias *new_alias = NULL; ++ ++ if (len >= IFALIASZ) ++ return -EINVAL; ++ ++ if (len) { ++ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); ++ if (!new_alias) ++ return -ENOMEM; ++ ++ memcpy(new_alias->ifalias, alias, len); ++ new_alias->ifalias[len] = 0; ++ } ++ ++ mutex_lock(&ifalias_mutex); ++ new_alias = rcu_replace_pointer(dev->ifalias, new_alias, ++ mutex_is_locked(&ifalias_mutex)); ++ mutex_unlock(&ifalias_mutex); ++ ++ if (new_alias) ++ kfree_rcu(new_alias, rcuhead); ++ ++ return len; ++} ++EXPORT_SYMBOL(dev_set_alias); ++ ++/** ++ * dev_get_alias - get ifalias of a device ++ * @dev: device ++ * @name: buffer to store name of ifalias ++ * @len: size of buffer ++ * ++ * get ifalias for a device. Caller must make sure dev cannot go ++ * away, e.g. rcu read lock or own a reference count to device. ++ */ ++int dev_get_alias(const struct net_device *dev, char *name, size_t len) ++{ ++ const struct dev_ifalias *alias; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ alias = rcu_dereference(dev->ifalias); ++ if (alias) ++ ret = snprintf(name, len, "%s", alias->ifalias); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/** ++ * netdev_features_change - device changes features ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. 
++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ struct netdev_notifier_change_info change_info = { ++ .info.dev = dev, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, ++ &change_info.info); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); ++ } ++} ++EXPORT_SYMBOL(netdev_state_change); ++ ++/** ++ * __netdev_notify_peers - notify network peers about existence of @dev, ++ * to be called when rtnl lock is already held. ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void __netdev_notify_peers(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); ++ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); ++} ++EXPORT_SYMBOL(__netdev_notify_peers); ++ ++/** ++ * netdev_notify_peers - notify network peers about existence of @dev ++ * @dev: network device ++ * ++ * Generate traffic such that interested network peers are aware of ++ * @dev, such as by generating a gratuitous ARP. This may be used when ++ * a device wants to inform the rest of the network about some sort of ++ * reconfiguration such as a failover event or virtual machine ++ * migration. ++ */ ++void netdev_notify_peers(struct net_device *dev) ++{ ++ rtnl_lock(); ++ __netdev_notify_peers(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(netdev_notify_peers); ++ ++static int napi_threaded_poll(void *data); ++ ++static int napi_kthread_create(struct napi_struct *n) ++{ ++ int err = 0; ++ ++ /* Create and wake up the kthread once to put it in ++ * TASK_INTERRUPTIBLE mode to avoid the blocked task ++ * warning and work with loadavg. ++ */ ++ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", ++ n->dev->name, n->napi_id); ++ if (IS_ERR(n->thread)) { ++ err = PTR_ERR(n->thread); ++ pr_err("kthread_run failed with err %d\n", err); ++ n->thread = NULL; ++ } ++ ++ return err; ++} ++ ++static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int ret; ++ ++ ASSERT_RTNL(); ++ dev_addr_check(dev); ++ ++ if (!netif_device_present(dev)) { ++ /* may be detached because parent is runtime-suspended */ ++ if (dev->dev.parent) ++ pm_runtime_resume(dev->dev.parent); ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ } ++ ++ /* Block netpoll from trying to do any rx path servicing. 
++ * If we don't do this there is a chance ndo_poll_controller ++ * or ndo_poll may be running while we open the device ++ */ ++ netpoll_poll_disable(dev); ++ ++ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ if (ops->ndo_validate_addr) ++ ret = ops->ndo_validate_addr(dev); ++ ++ if (!ret && ops->ndo_open) ++ ret = ops->ndo_open(dev); ++ ++ netpoll_poll_enable(dev); ++ ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ else { ++ dev->flags |= IFF_UP; ++ dev_set_rx_mode(dev); ++ dev_activate(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ } ++ ++ return ret; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * @extack: netlink extended ack ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ ret = __dev_open(dev, extack); ++ if (ret < 0) ++ return ret; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ ++ return ret; ++} ++EXPORT_SYMBOL(dev_open); ++ ++static void __dev_close_many(struct list_head *head) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ might_sleep(); ++ ++ list_for_each_entry(dev, head, close_list) { ++ /* Temporarily disable netpoll until the interface is down */ ++ netpoll_poll_disable(dev); ++ ++ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, it ++ * can be even on different cpu. So just clear netif_running(). ++ * ++ * dev->stop() will invoke napi_disable() on all of it's ++ * napi_struct instances on this device. ++ */ ++ smp_mb__after_atomic(); /* Commit netif_running(). */ ++ } ++ ++ dev_deactivate_many(head); ++ ++ list_for_each_entry(dev, head, close_list) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (ops->ndo_stop) ++ ops->ndo_stop(dev); ++ ++ dev->flags &= ~IFF_UP; ++ netpoll_poll_enable(dev); ++ } ++} ++ ++static void __dev_close(struct net_device *dev) ++{ ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ __dev_close_many(&single); ++ list_del(&single); ++} ++ ++void dev_close_many(struct list_head *head, bool unlink) ++{ ++ struct net_device *dev, *tmp; ++ ++ /* Remove the devices that don't need to be closed */ ++ list_for_each_entry_safe(dev, tmp, head, close_list) ++ if (!(dev->flags & IFF_UP)) ++ list_del_init(&dev->close_list); ++ ++ __dev_close_many(head); ++ ++ list_for_each_entry_safe(dev, tmp, head, close_list) { ++ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ if (unlink) ++ list_del_init(&dev->close_list); ++ } ++} ++EXPORT_SYMBOL(dev_close_many); ++ ++/** ++ * dev_close - shutdown an interface. 
++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++void dev_close(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ LIST_HEAD(single); ++ ++ list_add(&dev->close_list, &single); ++ dev_close_many(&single, true); ++ list_del(&single); ++ } ++} ++EXPORT_SYMBOL(dev_close); ++ ++ ++/** ++ * dev_disable_lro - disable Large Receive Offload on a device ++ * @dev: device ++ * ++ * Disable Large Receive Offload (LRO) on a net device. Must be ++ * called under RTNL. This is needed if received packets may be ++ * forwarded to another interface. ++ */ ++void dev_disable_lro(struct net_device *dev) ++{ ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ ++ dev->wanted_features &= ~NETIF_F_LRO; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_LRO)) ++ netdev_WARN(dev, "failed to disable LRO!\n"); ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) ++ dev_disable_lro(lower_dev); ++} ++EXPORT_SYMBOL(dev_disable_lro); ++ ++/** ++ * dev_disable_gro_hw - disable HW Generic Receive Offload on a device ++ * @dev: device ++ * ++ * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be ++ * called under RTNL. This is needed if Generic XDP is installed on ++ * the device. ++ */ ++static void dev_disable_gro_hw(struct net_device *dev) ++{ ++ dev->wanted_features &= ~NETIF_F_GRO_HW; ++ netdev_update_features(dev); ++ ++ if (unlikely(dev->features & NETIF_F_GRO_HW)) ++ netdev_WARN(dev, "failed to disable GRO_HW!\n"); ++} ++ ++const char *netdev_cmd_to_name(enum netdev_cmd cmd) ++{ ++#define N(val) \ ++ case NETDEV_##val: \ ++ return "NETDEV_" __stringify(val); ++ switch (cmd) { ++ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) ++ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) ++ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) ++ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) ++ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) ++ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) ++ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) ++ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) ++ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) ++ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) ++ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) ++ } ++#undef N ++ return "UNKNOWN_NETDEV_EVENT"; ++} ++EXPORT_SYMBOL_GPL(netdev_cmd_to_name); ++ ++static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, ++ struct net_device *dev) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ }; ++ ++ return nb->notifier_call(nb, val, &info); ++} ++ ++static int call_netdevice_register_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ int err; ++ ++ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ call_netdevice_notifier(nb, NETDEV_UP, dev); ++ return 0; ++} ++ ++static void call_netdevice_unregister_notifiers(struct notifier_block *nb, ++ struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ call_netdevice_notifier(nb, NETDEV_GOING_DOWN, ++ dev); ++ call_netdevice_notifier(nb, NETDEV_DOWN, dev); ++ } ++ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 
++} ++ ++static int call_netdevice_register_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ int err; ++ ++ for_each_netdev(net, dev) { ++ err = call_netdevice_register_notifiers(nb, dev); ++ if (err) ++ goto rollback; ++ } ++ return 0; ++ ++rollback: ++ for_each_netdev_continue_reverse(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++ return err; ++} ++ ++static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, ++ struct net *net) ++{ ++ struct net_device *dev; ++ ++ for_each_netdev(net, dev) ++ call_netdevice_unregister_notifiers(nb, dev); ++} ++ ++static int dev_boot_phase = 1; ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_register(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ if (dev_boot_phase) ++ goto unlock; ++ for_each_net(net) { ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err) ++ goto rollback; ++ } ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++ ++rollback: ++ for_each_net_continue_reverse(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++ raw_notifier_chain_unregister(&netdev_chain, nb); ++ goto unlock; ++} ++EXPORT_SYMBOL(register_netdevice_notifier); ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net *net; ++ int err; ++ ++ /* Close race with setup_net() and cleanup_net() */ ++ down_write(&pernet_ops_rwsem); ++ rtnl_lock(); ++ err = raw_notifier_chain_unregister(&netdev_chain, nb); ++ if (err) ++ goto unlock; ++ ++ for_each_net(net) ++ call_netdevice_unregister_net_notifiers(nb, net); ++ ++unlock: ++ rtnl_unlock(); ++ up_write(&pernet_ops_rwsem); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++ ++static int __register_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb, ++ bool ignore_call_fail) ++{ ++ int err; ++ ++ err = raw_notifier_chain_register(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ if (dev_boot_phase) ++ return 0; ++ ++ err = call_netdevice_register_net_notifiers(nb, net); ++ if (err && !ignore_call_fail) ++ goto chain_unregister; ++ ++ return 0; ++ ++chain_unregister: ++ raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ return err; ++} ++ ++static int __unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ err = raw_notifier_chain_unregister(&net->netdev_chain, nb); ++ if (err) ++ return err; ++ ++ call_netdevice_unregister_net_notifiers(nb, net); ++ return 0; ++} ++ ++/** ++ * register_netdevice_notifier_net - register a per-netns network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(net, nb, false); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_net); ++ ++/** ++ * unregister_netdevice_notifier_net - unregister a per-netns ++ * network notifier block ++ * @net: network namespace ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ * ++ * After unregistering unregister and down device events are synthesized ++ * for all devices on the device list to the removed notifier to remove ++ * the need for special case cleanup code. 
++ */ ++ ++int unregister_netdevice_notifier_net(struct net *net, ++ struct notifier_block *nb) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __unregister_netdevice_notifier_net(net, nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_net); ++ ++int register_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = __register_netdevice_notifier_net(dev_net(dev), nb, false); ++ if (!err) { ++ nn->nb = nb; ++ list_add(&nn->list, &dev->net_notifier_list); ++ } ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdevice_notifier_dev_net); ++ ++int unregister_netdevice_notifier_dev_net(struct net_device *dev, ++ struct notifier_block *nb, ++ struct netdev_net_notifier *nn) ++{ ++ int err; ++ ++ rtnl_lock(); ++ list_del(&nn->list); ++ err = __unregister_netdevice_notifier_net(dev_net(dev), nb); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); ++ ++static void move_netdevice_notifiers_dev_net(struct net_device *dev, ++ struct net *net) ++{ ++ struct netdev_net_notifier *nn; ++ ++ list_for_each_entry(nn, &dev->net_notifier_list, list) { ++ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); ++ __register_netdevice_notifier_net(net, nn->nb, true); ++ } ++} ++ ++/** ++ * call_netdevice_notifiers_info - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @info: notifier information data ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++static int call_netdevice_notifiers_info(unsigned long val, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* Run per-netns notifier block chain first, then run the global one. ++ * Hopefully, one day, the global one is going to be removed after ++ * all notifier block registrators get converted to be per-netns. ++ */ ++ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); ++ if (ret & NOTIFY_STOP_MASK) ++ return ret; ++ return raw_notifier_call_chain(&netdev_chain, val, info); ++} ++ ++/** ++ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks ++ * for and rollback on error ++ * @val_up: value passed unmodified to notifier function ++ * @val_down: value passed unmodified to the notifier function when ++ * recovering from an error on @val_up ++ * @info: notifier information data ++ * ++ * Call all per-netns network notifier blocks, but not notifier blocks on ++ * the global notifier chain. Parameters and return value are as for ++ * raw_notifier_call_chain_robust(). 
++ */ ++ ++static int ++call_netdevice_notifiers_info_robust(unsigned long val_up, ++ unsigned long val_down, ++ struct netdev_notifier_info *info) ++{ ++ struct net *net = dev_net(info->dev); ++ ++ ASSERT_RTNL(); ++ ++ return raw_notifier_call_chain_robust(&net->netdev_chain, ++ val_up, val_down, info); ++} ++ ++static int call_netdevice_notifiers_extack(unsigned long val, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_info info = { ++ .dev = dev, ++ .extack = extack, ++ }; ++ ++ return call_netdevice_notifiers_info(val, &info); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ++{ ++ return call_netdevice_notifiers_extack(val, dev, NULL); ++} ++EXPORT_SYMBOL(call_netdevice_notifiers); ++ ++/** ++ * call_netdevice_notifiers_mtu - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @dev: net_device pointer passed unmodified to notifier function ++ * @arg: additional u32 argument passed to the notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for raw_notifier_call_chain(). ++ */ ++static int call_netdevice_notifiers_mtu(unsigned long val, ++ struct net_device *dev, u32 arg) ++{ ++ struct netdev_notifier_info_ext info = { ++ .info.dev = dev, ++ .ext.mtu = arg, ++ }; ++ ++ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); ++ ++ return call_netdevice_notifiers_info(val, &info.info); ++} ++ ++#ifdef CONFIG_NET_INGRESS ++static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); ++ ++void net_inc_ingress_queue(void) ++{ ++ static_branch_inc(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_ingress_queue); ++ ++void net_dec_ingress_queue(void) ++{ ++ static_branch_dec(&ingress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_ingress_queue); ++#endif ++ ++#ifdef CONFIG_NET_EGRESS ++static DEFINE_STATIC_KEY_FALSE(egress_needed_key); ++ ++void net_inc_egress_queue(void) ++{ ++ static_branch_inc(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_inc_egress_queue); ++ ++void net_dec_egress_queue(void) ++{ ++ static_branch_dec(&egress_needed_key); ++} ++EXPORT_SYMBOL_GPL(net_dec_egress_queue); ++#endif ++ ++DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); ++EXPORT_SYMBOL(netstamp_needed_key); ++#ifdef CONFIG_JUMP_LABEL ++static atomic_t netstamp_needed_deferred; ++static atomic_t netstamp_wanted; ++static void netstamp_clear(struct work_struct *work) ++{ ++ int deferred = atomic_xchg(&netstamp_needed_deferred, 0); ++ int wanted; ++ ++ wanted = atomic_add_return(deferred, &netstamp_wanted); ++ if (wanted > 0) ++ static_branch_enable(&netstamp_needed_key); ++ else ++ static_branch_disable(&netstamp_needed_key); ++} ++static DECLARE_WORK(netstamp_work, netstamp_clear); ++#endif ++ ++void net_enable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_inc(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_enable_timestamp); ++ 
++void net_disable_timestamp(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else ++ static_branch_dec(&netstamp_needed_key); ++#endif ++} ++EXPORT_SYMBOL(net_disable_timestamp); ++ ++static inline void net_timestamp_set(struct sk_buff *skb) ++{ ++ skb->tstamp = 0; ++ skb->mono_delivery_time = 0; ++ if (static_branch_unlikely(&netstamp_needed_key)) ++ skb->tstamp = ktime_get_real(); ++} ++ ++#define net_timestamp_check(COND, SKB) \ ++ if (static_branch_unlikely(&netstamp_needed_key)) { \ ++ if ((COND) && !(SKB)->tstamp) \ ++ (SKB)->tstamp = ktime_get_real(); \ ++ } \ ++ ++bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) ++{ ++ return __is_skb_forwardable(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(is_skb_forwardable); ++ ++static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, ++ bool check_mtu) ++{ ++ int ret = ____dev_forward_skb(dev, skb, check_mtu); ++ ++ if (likely(!ret)) { ++ skb->protocol = eth_type_trans(skb, dev); ++ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); ++ } ++ ++ return ret; ++} ++ ++int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, true); ++} ++EXPORT_SYMBOL_GPL(__dev_forward_skb); ++ ++/** ++ * dev_forward_skb - loopback an skb to another netif ++ * ++ * @dev: destination network device ++ * @skb: buffer to forward ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped, but freed) ++ * ++ * dev_forward_skb can be used for injecting an skb from the ++ * start_xmit function of one device into the receive queue ++ * of another device. ++ * ++ * The receiving device may be in another namespace, so ++ * we have to clear all information in the skb that could ++ * impact namespace isolation. 
++ */ ++int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); ++} ++EXPORT_SYMBOL_GPL(dev_forward_skb); ++ ++int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) ++{ ++ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); ++} ++ ++static inline int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ return -ENOMEM; ++ refcount_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++} ++ ++static inline void deliver_ptype_list_skb(struct sk_buff *skb, ++ struct packet_type **pt, ++ struct net_device *orig_dev, ++ __be16 type, ++ struct list_head *ptype_list) ++{ ++ struct packet_type *ptype, *pt_prev = *pt; ++ ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->type != type) ++ continue; ++ if (pt_prev) ++ deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ *pt = pt_prev; ++} ++ ++static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) ++{ ++ if (!ptype->af_packet_priv || !skb->sk) ++ return false; ++ ++ if (ptype->id_match) ++ return ptype->id_match(ptype, skb->sk); ++ else if ((struct sock *)ptype->af_packet_priv == skb->sk) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * dev_nit_active - return true if any network interface taps are in use ++ * ++ * @dev: network device to check for the presence of taps ++ */ ++bool dev_nit_active(struct net_device *dev) ++{ ++ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); ++} ++EXPORT_SYMBOL_GPL(dev_nit_active); ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ struct sk_buff *skb2 = NULL; ++ struct packet_type *pt_prev = NULL; ++ struct list_head *ptype_list = &ptype_all; ++ ++ rcu_read_lock(); ++again: ++ list_for_each_entry_rcu(ptype, ptype_list, list) { ++ if (ptype->ignore_outgoing) ++ continue; ++ ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if (skb_loop_sk(ptype, skb)) ++ continue; ++ ++ if (pt_prev) { ++ deliver_skb(skb2, pt_prev, skb->dev); ++ pt_prev = ptype; ++ continue; ++ } ++ ++ /* need to clone skb, done only once */ ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ goto out_unlock; ++ ++ net_timestamp_set(skb2); ++ ++ /* skb->nh should be correctly ++ * set by sender, so that the second statement is ++ * just protection against buggy protocols. 
++ */ ++ skb_reset_mac_header(skb2); ++ ++ if (skb_network_header(skb2) < skb2->data || ++ skb_network_header(skb2) > skb_tail_pointer(skb2)) { ++ net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ++ ntohs(skb2->protocol), ++ dev->name); ++ skb_reset_network_header(skb2); ++ } ++ ++ skb2->transport_header = skb2->network_header; ++ skb2->pkt_type = PACKET_OUTGOING; ++ pt_prev = ptype; ++ } ++ ++ if (ptype_list == &ptype_all) { ++ ptype_list = &dev->ptype_all; ++ goto again; ++ } ++out_unlock: ++ if (pt_prev) { ++ if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) ++ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); ++ else ++ kfree_skb(skb2); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); ++ ++/** ++ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change ++ * @dev: Network device ++ * @txq: number of queues available ++ * ++ * If real_num_tx_queues is changed the tc mappings may no longer be ++ * valid. To resolve this verify the tc mapping remains valid and if ++ * not NULL the mapping. With no priorities mapping to this ++ * offset/count pair it will no longer be used. In the worst case TC0 ++ * is invalid nothing can be done so disable priority mappings. If is ++ * expected that drivers will fix this mapping if they can before ++ * calling netif_set_real_num_tx_queues. ++ */ ++static void netif_setup_tc(struct net_device *dev, unsigned int txq) ++{ ++ int i; ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ ++ /* If TC0 is invalidated disable TC mapping */ ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); ++ dev->num_tc = 0; ++ return; ++ } ++ ++ /* Invalidated prio to tc mappings set to TC0 */ ++ for (i = 1; i < TC_BITMASK + 1; i++) { ++ int q = netdev_get_prio_tc_map(dev, i); ++ ++ tc = &dev->tc_to_txq[q]; ++ if (tc->offset + tc->count > txq) { ++ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", ++ i, q); ++ netdev_set_prio_tc_map(dev, i, 0); ++ } ++ } ++} ++ ++int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) ++{ ++ if (dev->num_tc) { ++ struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; ++ int i; ++ ++ /* walk through the TCs and see if it falls into any of them */ ++ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { ++ if ((txq - tc->offset) < tc->count) ++ return i; ++ } ++ ++ /* didn't find it, just return -1 to indicate no match */ ++ return -1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_txq_to_tc); ++ ++#ifdef CONFIG_XPS ++static struct static_key xps_needed __read_mostly; ++static struct static_key xps_rxqs_needed __read_mostly; ++static DEFINE_MUTEX(xps_map_mutex); ++#define xmap_dereference(P) \ ++ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) ++ ++static bool remove_xps_queue(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *old_maps, int tci, u16 index) ++{ ++ struct xps_map *map = NULL; ++ int pos; ++ ++ if (dev_maps) ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ return false; ++ ++ for (pos = map->len; pos--;) { ++ if (map->queues[pos] != index) ++ continue; ++ ++ if (map->len > 1) { ++ map->queues[pos] = map->queues[--map->len]; ++ break; ++ } ++ ++ if (old_maps) ++ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool remove_xps_queue_cpu(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ int cpu, u16 offset, u16 count) ++{ ++ int num_tc = dev_maps->num_tc; ++ bool active = false; ++ int tci; ++ ++ for (tci = cpu * num_tc; num_tc--; tci++) { ++ int i, j; ++ ++ for (i = count, j = offset; i--; j++) { ++ if (!remove_xps_queue(dev_maps, NULL, tci, j)) ++ break; ++ } ++ ++ active |= i < 0; ++ } ++ ++ return active; ++} ++ ++static void reset_xps_maps(struct net_device *dev, ++ struct xps_dev_maps *dev_maps, ++ enum xps_map_type type) ++{ ++ static_key_slow_dec_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_dec_cpuslocked(&xps_rxqs_needed); ++ ++ RCU_INIT_POINTER(dev->xps_maps[type], NULL); ++ ++ kfree_rcu(dev_maps, rcu); ++} ++ ++static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, ++ u16 offset, u16 count) ++{ ++ struct xps_dev_maps *dev_maps; ++ bool active = false; ++ int i, j; ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (!dev_maps) ++ return; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) ++ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++ if (type == XPS_CPUS) { ++ for (i = offset + (count - 1); count--; i--) ++ netdev_queue_numa_node_write( ++ netdev_get_tx_queue(dev, i), NUMA_NO_NODE); ++ } ++} ++ ++static void netif_reset_xps_queues(struct net_device *dev, u16 offset, ++ u16 count) ++{ ++ if (!static_key_false(&xps_needed)) ++ return; ++ ++ cpus_read_lock(); ++ mutex_lock(&xps_map_mutex); ++ ++ if (static_key_false(&xps_rxqs_needed)) ++ clean_xps_maps(dev, XPS_RXQS, offset, count); ++ ++ clean_xps_maps(dev, XPS_CPUS, offset, count); ++ ++ mutex_unlock(&xps_map_mutex); ++ cpus_read_unlock(); ++} ++ ++static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) ++{ ++ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); ++} ++ ++static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, ++ u16 index, bool is_rxqs_map) ++{ ++ struct xps_map *new_map; ++ int alloc_len = 
XPS_MIN_MAP_ALLOC; ++ int i, pos; ++ ++ for (pos = 0; map && pos < map->len; pos++) { ++ if (map->queues[pos] != index) ++ continue; ++ return map; ++ } ++ ++ /* Need to add tx-queue to this CPU's/rx-queue's existing map */ ++ if (map) { ++ if (pos < map->alloc_len) ++ return map; ++ ++ alloc_len = map->alloc_len * 2; ++ } ++ ++ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's ++ * map ++ */ ++ if (is_rxqs_map) ++ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); ++ else ++ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, ++ cpu_to_node(attr_index)); ++ if (!new_map) ++ return NULL; ++ ++ for (i = 0; i < pos; i++) ++ new_map->queues[i] = map->queues[i]; ++ new_map->alloc_len = alloc_len; ++ new_map->len = pos; ++ ++ return new_map; ++} ++ ++/* Copy xps maps at a given index */ ++static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, ++ struct xps_dev_maps *new_dev_maps, int index, ++ int tc, bool skip_tc) ++{ ++ int i, tci = index * dev_maps->num_tc; ++ struct xps_map *map; ++ ++ /* copy maps belonging to foreign traffic classes */ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && skip_tc) ++ continue; ++ ++ /* fill in the new device map from the old device map */ ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++} ++ ++/* Must be called under cpus_read_lock */ ++int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, ++ u16 index, enum xps_map_type type) ++{ ++ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; ++ const unsigned long *online_mask = NULL; ++ bool active = false, copy = false; ++ int i, j, tci, numa_node_id = -2; ++ int maps_sz, num_tc = 1, tc = 0; ++ struct xps_map *map, *new_map; ++ unsigned int nr_ids; ++ ++ if (dev->num_tc) { ++ /* Do not allow XPS on subordinate device directly */ ++ num_tc = dev->num_tc; ++ if (num_tc < 0) ++ return -EINVAL; ++ ++ /* If queue belongs to subordinate dev use its map */ ++ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; ++ ++ tc = netdev_txq_to_tc(dev, index); ++ if (tc < 0) ++ return -EINVAL; ++ } ++ ++ mutex_lock(&xps_map_mutex); ++ ++ dev_maps = xmap_dereference(dev->xps_maps[type]); ++ if (type == XPS_RXQS) { ++ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); ++ nr_ids = dev->num_rx_queues; ++ } else { ++ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); ++ if (num_possible_cpus() > 1) ++ online_mask = cpumask_bits(cpu_online_mask); ++ nr_ids = nr_cpu_ids; ++ } ++ ++ if (maps_sz < L1_CACHE_BYTES) ++ maps_sz = L1_CACHE_BYTES; ++ ++ /* The old dev_maps could be larger or smaller than the one we're ++ * setting up now, as dev->num_tc or nr_ids could have been updated in ++ * between. We could try to be smart, but let's be safe instead and only ++ * copy foreign traffic classes if the two map sizes match. ++ */ ++ if (dev_maps && ++ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) ++ copy = true; ++ ++ /* allocate memory for queue storage */ ++ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), ++ j < nr_ids;) { ++ if (!new_dev_maps) { ++ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); ++ if (!new_dev_maps) { ++ mutex_unlock(&xps_map_mutex); ++ return -ENOMEM; ++ } ++ ++ new_dev_maps->nr_ids = nr_ids; ++ new_dev_maps->num_tc = num_tc; ++ } ++ ++ tci = j * num_tc + tc; ++ map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; ++ ++ map = expand_xps_map(map, j, index, type == XPS_RXQS); ++ if (!map) ++ goto error; ++ ++ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); ++ } ++ ++ if (!new_dev_maps) ++ goto out_no_new_maps; ++ ++ if (!dev_maps) { ++ /* Increment static keys at most once per type */ ++ static_key_slow_inc_cpuslocked(&xps_needed); ++ if (type == XPS_RXQS) ++ static_key_slow_inc_cpuslocked(&xps_rxqs_needed); ++ } ++ ++ for (j = 0; j < nr_ids; j++) { ++ bool skip_tc = false; ++ ++ tci = j * num_tc + tc; ++ if (netif_attr_test_mask(j, mask, nr_ids) && ++ netif_attr_test_online(j, online_mask, nr_ids)) { ++ /* add tx-queue to CPU/rx-queue maps */ ++ int pos = 0; ++ ++ skip_tc = true; ++ ++ map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ while ((pos < map->len) && (map->queues[pos] != index)) ++ pos++; ++ ++ if (pos == map->len) ++ map->queues[map->len++] = index; ++#ifdef CONFIG_NUMA ++ if (type == XPS_CPUS) { ++ if (numa_node_id == -2) ++ numa_node_id = cpu_to_node(j); ++ else if (numa_node_id != cpu_to_node(j)) ++ numa_node_id = -1; ++ } ++#endif ++ } ++ ++ if (copy) ++ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, ++ skip_tc); ++ } ++ ++ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); ++ ++ /* Cleanup old maps */ ++ if (!dev_maps) ++ goto out_no_old_maps; ++ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { ++ map = xmap_dereference(dev_maps->attr_map[tci]); ++ if (!map) ++ continue; ++ ++ if (copy) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ if (map == new_map) ++ continue; ++ } ++ ++ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); ++ kfree_rcu(map, rcu); ++ } ++ } ++ ++ old_dev_maps = dev_maps; ++ ++out_no_old_maps: ++ dev_maps = new_dev_maps; ++ active = true; ++ ++out_no_new_maps: ++ if (type == XPS_CPUS) ++ /* update Tx queue numa node */ ++ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), ++ (numa_node_id >= 0) ? ++ numa_node_id : NUMA_NO_NODE); ++ ++ if (!dev_maps) ++ goto out_no_maps; ++ ++ /* removes tx-queue from unused CPUs/rx-queues */ ++ for (j = 0; j < dev_maps->nr_ids; j++) { ++ tci = j * dev_maps->num_tc; ++ ++ for (i = 0; i < dev_maps->num_tc; i++, tci++) { ++ if (i == tc && ++ netif_attr_test_mask(j, mask, dev_maps->nr_ids) && ++ netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) ++ continue; ++ ++ active |= remove_xps_queue(dev_maps, ++ copy ? old_dev_maps : NULL, ++ tci, index); ++ } ++ } ++ ++ if (old_dev_maps) ++ kfree_rcu(old_dev_maps, rcu); ++ ++ /* free map if not active */ ++ if (!active) ++ reset_xps_maps(dev, dev_maps, type); ++ ++out_no_maps: ++ mutex_unlock(&xps_map_mutex); ++ ++ return 0; ++error: ++ /* remove any maps that we added */ ++ for (j = 0; j < nr_ids; j++) { ++ for (i = num_tc, tci = j * num_tc; i--; tci++) { ++ new_map = xmap_dereference(new_dev_maps->attr_map[tci]); ++ map = copy ? 
++ xmap_dereference(dev_maps->attr_map[tci]) : ++ NULL; ++ if (new_map && new_map != map) ++ kfree(new_map); ++ } ++ } ++ ++ mutex_unlock(&xps_map_mutex); ++ ++ kfree(new_dev_maps); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(__netif_set_xps_queue); ++ ++int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, ++ u16 index) ++{ ++ int ret; ++ ++ cpus_read_lock(); ++ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); ++ cpus_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_set_xps_queue); ++ ++#endif ++static void netdev_unbind_all_sb_channels(struct net_device *dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++ /* Unbind any subordinate channels */ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev) ++ netdev_unbind_sb_channel(dev, txq->sb_dev); ++ } ++} ++ ++void netdev_reset_tc(struct net_device *dev) ++{ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ /* Reset TC configuration of device */ ++ dev->num_tc = 0; ++ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); ++ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); ++} ++EXPORT_SYMBOL(netdev_reset_tc); ++ ++int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) ++{ ++ if (tc >= dev->num_tc) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues(dev, offset, count); ++#endif ++ dev->tc_to_txq[tc].count = count; ++ dev->tc_to_txq[tc].offset = offset; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_tc_queue); ++ ++int netdev_set_num_tc(struct net_device *dev, u8 num_tc) ++{ ++ if (num_tc > TC_MAX_QUEUE) ++ return -EINVAL; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ netdev_unbind_all_sb_channels(dev); ++ ++ dev->num_tc = num_tc; ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_num_tc); ++ ++void netdev_unbind_sb_channel(struct net_device *dev, ++ struct net_device *sb_dev) ++{ ++ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; ++ ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(sb_dev, 0); ++#endif ++ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); ++ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); ++ ++ while (txq-- != &dev->_tx[0]) { ++ if (txq->sb_dev == sb_dev) ++ txq->sb_dev = NULL; ++ } ++} ++EXPORT_SYMBOL(netdev_unbind_sb_channel); ++ ++int netdev_bind_sb_channel_queue(struct net_device *dev, ++ struct net_device *sb_dev, ++ u8 tc, u16 count, u16 offset) ++{ ++ /* Make certain the sb_dev and dev are already configured */ ++ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) ++ return -EINVAL; ++ ++ /* We cannot hand out queues we don't have */ ++ if ((offset + count) > dev->real_num_tx_queues) ++ return -EINVAL; ++ ++ /* Record the mapping */ ++ sb_dev->tc_to_txq[tc].count = count; ++ sb_dev->tc_to_txq[tc].offset = offset; ++ ++ /* Provide a way for Tx queue to find the tc_to_txq map or ++ * XPS map for itself. ++ */ ++ while (count--) ++ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_bind_sb_channel_queue); ++ ++int netdev_set_sb_channel(struct net_device *dev, u16 channel) ++{ ++ /* Do not use a multiqueue device to represent a subordinate channel */ ++ if (netif_is_multiqueue(dev)) ++ return -ENODEV; ++ ++ /* We allow channels 1 - 32767 to be used for subordinate channels. ++ * Channel 0 is meant to be "native" mode and used only to represent ++ * the main root device. 
We allow writing 0 to reset the device back ++ * to normal mode after being used as a subordinate channel. ++ */ ++ if (channel > S16_MAX) ++ return -EINVAL; ++ ++ dev->num_tc = -channel; ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_set_sb_channel); ++ ++/* ++ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues ++ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. ++ */ ++int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) ++{ ++ bool disabling; ++ int rc; ++ ++ disabling = txq < dev->real_num_tx_queues; ++ ++ if (txq < 1 || txq > dev->num_tx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED || ++ dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ ++ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, ++ txq); ++ if (rc) ++ return rc; ++ ++ if (dev->num_tc) ++ netif_setup_tc(dev, txq); ++ ++ dev_qdisc_change_real_num_tx(dev, txq); ++ ++ dev->real_num_tx_queues = txq; ++ ++ if (disabling) { ++ synchronize_net(); ++ qdisc_reset_all_tx_gt(dev, txq); ++#ifdef CONFIG_XPS ++ netif_reset_xps_queues_gt(dev, txq); ++#endif ++ } ++ } else { ++ dev->real_num_tx_queues = txq; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_tx_queues); ++ ++#ifdef CONFIG_SYSFS ++/** ++ * netif_set_real_num_rx_queues - set actual number of RX queues used ++ * @dev: Network device ++ * @rxq: Actual number of RX queues ++ * ++ * This must be called either with the rtnl_lock held or before ++ * registration of the net device. Returns 0 on success, or a ++ * negative error code. If called before registration, it always ++ * succeeds. ++ */ ++int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) ++{ ++ int rc; ++ ++ if (rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ if (dev->reg_state == NETREG_REGISTERED) { ++ ASSERT_RTNL(); ++ ++ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, ++ rxq); ++ if (rc) ++ return rc; ++ } ++ ++ dev->real_num_rx_queues = rxq; ++ return 0; ++} ++EXPORT_SYMBOL(netif_set_real_num_rx_queues); ++#endif ++ ++/** ++ * netif_set_real_num_queues - set actual number of RX and TX queues used ++ * @dev: Network device ++ * @txq: Actual number of TX queues ++ * @rxq: Actual number of RX queues ++ * ++ * Set the real number of both TX and RX queues. ++ * Does nothing if the number of queues is already correct. ++ */ ++int netif_set_real_num_queues(struct net_device *dev, ++ unsigned int txq, unsigned int rxq) ++{ ++ unsigned int old_rxq = dev->real_num_rx_queues; ++ int err; ++ ++ if (txq < 1 || txq > dev->num_tx_queues || ++ rxq < 1 || rxq > dev->num_rx_queues) ++ return -EINVAL; ++ ++ /* Start from increases, so the error path only does decreases - ++ * decreases can't fail. 
++ */ ++ if (rxq > dev->real_num_rx_queues) { ++ err = netif_set_real_num_rx_queues(dev, rxq); ++ if (err) ++ return err; ++ } ++ if (txq > dev->real_num_tx_queues) { ++ err = netif_set_real_num_tx_queues(dev, txq); ++ if (err) ++ goto undo_rx; ++ } ++ if (rxq < dev->real_num_rx_queues) ++ WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); ++ if (txq < dev->real_num_tx_queues) ++ WARN_ON(netif_set_real_num_tx_queues(dev, txq)); ++ ++ return 0; ++undo_rx: ++ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); ++ return err; ++} ++EXPORT_SYMBOL(netif_set_real_num_queues); ++ ++/** ++ * netif_set_tso_max_size() - set the max size of TSO frames supported ++ * @dev: netdev to update ++ * @size: max skb->len of a TSO frame ++ * ++ * Set the limit on the size of TSO super-frames the device can handle. ++ * Unless explicitly set the stack will assume the value of ++ * %GSO_LEGACY_MAX_SIZE. ++ */ ++void netif_set_tso_max_size(struct net_device *dev, unsigned int size) ++{ ++ dev->tso_max_size = min(GSO_MAX_SIZE, size); ++ if (size < READ_ONCE(dev->gso_max_size)) ++ netif_set_gso_max_size(dev, size); ++} ++EXPORT_SYMBOL(netif_set_tso_max_size); ++ ++/** ++ * netif_set_tso_max_segs() - set the max number of segs supported for TSO ++ * @dev: netdev to update ++ * @segs: max number of TCP segments ++ * ++ * Set the limit on the number of TCP segments the device can generate from ++ * a single TSO super-frame. ++ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. ++ */ ++void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) ++{ ++ dev->tso_max_segs = segs; ++ if (segs < READ_ONCE(dev->gso_max_segs)) ++ netif_set_gso_max_segs(dev, segs); ++} ++EXPORT_SYMBOL(netif_set_tso_max_segs); ++ ++/** ++ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper ++ * @to: netdev to update ++ * @from: netdev from which to copy the limits ++ */ ++void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) ++{ ++ netif_set_tso_max_size(to, from->tso_max_size); ++ netif_set_tso_max_segs(to, from->tso_max_segs); ++} ++EXPORT_SYMBOL(netif_inherit_tso_max); ++ ++/** ++ * netif_get_num_default_rss_queues - default number of RSS queues ++ * ++ * Default value is the number of physical cores if there are only 1 or 2, or ++ * divided by 2 if there are more. ++ */ ++int netif_get_num_default_rss_queues(void) ++{ ++ cpumask_var_t cpus; ++ int cpu, count = 0; ++ ++ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) ++ return 1; ++ ++ cpumask_copy(cpus, cpu_online_mask); ++ for_each_cpu(cpu, cpus) { ++ ++count; ++ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); ++ } ++ free_cpumask_var(cpus); ++ ++ return count > 2 ? 
DIV_ROUND_UP(count, 2) : count; ++} ++EXPORT_SYMBOL(netif_get_num_default_rss_queues); ++ ++static void __netif_reschedule(struct Qdisc *q) ++{ ++ struct softnet_data *sd; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ sd = this_cpu_ptr(&softnet_data); ++ q->next_sched = NULL; ++ *sd->output_queue_tailp = q; ++ sd->output_queue_tailp = &q->next_sched; ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++ ++void __netif_schedule(struct Qdisc *q) ++{ ++ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) ++ __netif_reschedule(q); ++} ++EXPORT_SYMBOL(__netif_schedule); ++ ++struct dev_kfree_skb_cb { ++ enum skb_free_reason reason; ++}; ++ ++static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) ++{ ++ return (struct dev_kfree_skb_cb *)skb->cb; ++} ++ ++void netif_schedule_queue(struct netdev_queue *txq) ++{ ++ rcu_read_lock(); ++ if (!netif_xmit_stopped(txq)) { ++ struct Qdisc *q = rcu_dereference(txq->qdisc); ++ ++ __netif_schedule(q); ++ } ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(netif_schedule_queue); ++ ++void netif_tx_wake_queue(struct netdev_queue *dev_queue) ++{ ++ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { ++ struct Qdisc *q; ++ ++ rcu_read_lock(); ++ q = rcu_dereference(dev_queue->qdisc); ++ __netif_schedule(q); ++ rcu_read_unlock(); ++ } ++} ++EXPORT_SYMBOL(netif_tx_wake_queue); ++ ++void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ unsigned long flags; ++ ++ if (unlikely(!skb)) ++ return; ++ ++ if (likely(refcount_read(&skb->users) == 1)) { ++ smp_rmb(); ++ refcount_set(&skb->users, 0); ++ } else if (likely(!refcount_dec_and_test(&skb->users))) { ++ return; ++ } ++ get_kfree_skb_cb(skb)->reason = reason; ++ local_irq_save(flags); ++ skb->next = __this_cpu_read(softnet_data.completion_queue); ++ __this_cpu_write(softnet_data.completion_queue, skb); ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_irq); ++ ++void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) ++{ ++ if (in_hardirq() || irqs_disabled()) ++ __dev_kfree_skb_irq(skb, reason); ++ else ++ dev_kfree_skb(skb); ++} ++EXPORT_SYMBOL(__dev_kfree_skb_any); ++ ++ ++/** ++ * netif_device_detach - mark device as removed ++ * @dev: network device ++ * ++ * Mark device as removed from system and therefore no longer available. ++ */ ++void netif_device_detach(struct net_device *dev) ++{ ++ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_stop_all_queues(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_detach); ++ ++/** ++ * netif_device_attach - mark device as attached ++ * @dev: network device ++ * ++ * Mark device as attached from system and restart if needed. ++ */ ++void netif_device_attach(struct net_device *dev) ++{ ++ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && ++ netif_running(dev)) { ++ netif_tx_wake_all_queues(dev); ++ __netdev_watchdog_up(dev); ++ } ++} ++EXPORT_SYMBOL(netif_device_attach); ++ ++/* ++ * Returns a Tx hash based on the given packet descriptor a Tx queues' number ++ * to be used as a distribution range. 
++ */ ++static u16 skb_tx_hash(const struct net_device *dev, ++ const struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++ u32 hash; ++ u16 qoffset = 0; ++ u16 qcount = dev->real_num_tx_queues; ++ ++ if (dev->num_tc) { ++ u8 tc = netdev_get_prio_tc_map(dev, skb->priority); ++ ++ qoffset = sb_dev->tc_to_txq[tc].offset; ++ qcount = sb_dev->tc_to_txq[tc].count; ++ if (unlikely(!qcount)) { ++ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", ++ sb_dev->name, qoffset, tc); ++ qoffset = 0; ++ qcount = dev->real_num_tx_queues; ++ } ++ } ++ ++ if (skb_rx_queue_recorded(skb)) { ++ hash = skb_get_rx_queue(skb); ++ if (hash >= qoffset) ++ hash -= qoffset; ++ while (unlikely(hash >= qcount)) ++ hash -= qcount; ++ return hash + qoffset; ++ } ++ ++ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; ++} ++ ++static void skb_warn_bad_offload(const struct sk_buff *skb) ++{ ++ static const netdev_features_t null_features; ++ struct net_device *dev = skb->dev; ++ const char *name = ""; ++ ++ if (!net_ratelimit()) ++ return; ++ ++ if (dev) { ++ if (dev->dev.parent) ++ name = dev_driver_string(dev->dev.parent); ++ else ++ name = netdev_name(dev); ++ } ++ skb_dump(KERN_WARNING, skb, false); ++ WARN(1, "%s: caps=(%pNF, %pNF)\n", ++ name, dev ? &dev->features : &null_features, ++ skb->sk ? &skb->sk->sk_route_caps : &null_features); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. ++ */ ++int skb_checksum_help(struct sk_buff *skb) ++{ ++ __wsum csum; ++ int ret = 0, offset; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) ++ goto out_set_summed; ++ ++ if (unlikely(skb_is_gso(skb))) { ++ skb_warn_bad_offload(skb); ++ return -EINVAL; ++ } ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. ++ */ ++ if (skb_has_shared_frag(skb)) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ ++ offset = skb_checksum_start_offset(skb); ++ ret = -EINVAL; ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ csum = skb_checksum(skb, offset, skb->len - offset, 0); ++ ++ offset += skb->csum_offset; ++ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { ++ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); ++ goto out; ++ } ++ ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); ++ if (ret) ++ goto out; ++ ++ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; ++out_set_summed: ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++EXPORT_SYMBOL(skb_checksum_help); ++ ++int skb_crc32c_csum_help(struct sk_buff *skb) ++{ ++ __le32 crc32c_csum; ++ int ret = 0, offset, start; ++ ++ if (skb->ip_summed != CHECKSUM_PARTIAL) ++ goto out; ++ ++ if (unlikely(skb_is_gso(skb))) ++ goto out; ++ ++ /* Before computing a checksum, we should make sure no frag could ++ * be modified by an external entity : checksum could be wrong. 
++ */ ++ if (unlikely(skb_has_shared_frag(skb))) { ++ ret = __skb_linearize(skb); ++ if (ret) ++ goto out; ++ } ++ start = skb_checksum_start_offset(skb); ++ offset = start + offsetof(struct sctphdr, checksum); ++ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ ret = skb_ensure_writable(skb, offset + sizeof(__le32)); ++ if (ret) ++ goto out; ++ ++ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, ++ skb->len - start, ~(__u32)0, ++ crc32c_csum_stub)); ++ *(__le32 *)(skb->data + offset) = crc32c_csum; ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->csum_not_inet = 0; ++out: ++ return ret; ++} ++ ++__be16 skb_network_protocol(struct sk_buff *skb, int *depth) ++{ ++ __be16 type = skb->protocol; ++ ++ /* Tunnel gso handlers can set protocol to ethernet. */ ++ if (type == htons(ETH_P_TEB)) { ++ struct ethhdr *eth; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) ++ return 0; ++ ++ eth = (struct ethhdr *)skb->data; ++ type = eth->h_proto; ++ } ++ ++ return __vlan_get_protocol(skb, type, depth); ++} ++ ++/* openvswitch calls this on rx path, so we need a different check. ++ */ ++static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) ++{ ++ if (tx_path) ++ return skb->ip_summed != CHECKSUM_PARTIAL && ++ skb->ip_summed != CHECKSUM_UNNECESSARY; ++ ++ return skb->ip_summed == CHECKSUM_NONE; ++} ++ ++/** ++ * __skb_gso_segment - Perform segmentation on skb. ++ * @skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * @tx_path: whether it is called in TX path ++ * ++ * This function segments the given skb and returns a list of segments. ++ * ++ * It may return NULL if the skb requires no segmentation. This is ++ * only possible when GSO is used for verifying header integrity. ++ * ++ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. ++ */ ++struct sk_buff *__skb_gso_segment(struct sk_buff *skb, ++ netdev_features_t features, bool tx_path) ++{ ++ struct sk_buff *segs; ++ ++ if (unlikely(skb_needs_check(skb, tx_path))) { ++ int err; ++ ++ /* We're going to init ->check field in TCP or UDP header */ ++ err = skb_cow_head(skb, 0); ++ if (err < 0) ++ return ERR_PTR(err); ++ } ++ ++ /* Only report GSO partial support if it will enable us to ++ * support segmentation on this frame without needing additional ++ * work. ++ */ ++ if (features & NETIF_F_GSO_PARTIAL) { ++ netdev_features_t partial_features = NETIF_F_GSO_ROBUST; ++ struct net_device *dev = skb->dev; ++ ++ partial_features |= dev->features & dev->gso_partial_features; ++ if (!skb_gso_ok(skb, features | partial_features)) ++ features &= ~NETIF_F_GSO_PARTIAL; ++ } ++ ++ BUILD_BUG_ON(SKB_GSO_CB_OFFSET + ++ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); ++ ++ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); ++ SKB_GSO_CB(skb)->encap_level = 0; ++ ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ segs = skb_mac_gso_segment(skb, features); ++ ++ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) ++ skb_warn_bad_offload(skb); ++ ++ return segs; ++} ++EXPORT_SYMBOL(__skb_gso_segment); ++ ++/* Take action when hardware reception checksum errors are detected. 
*/ ++#ifdef CONFIG_BUG ++static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ netdev_err(dev, "hw csum failure\n"); ++ skb_dump(KERN_ERR, skb, true); ++ dump_stack(); ++} ++ ++void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) ++{ ++ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); ++} ++EXPORT_SYMBOL(netdev_rx_csum_fault); ++#endif ++ ++/* XXX: check that highmem exists at all on the given machine. */ ++static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++#ifdef CONFIG_HIGHMEM ++ int i; ++ ++ if (!(dev->features & NETIF_F_HIGHDMA)) { ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (PageHighMem(skb_frag_page(frag))) ++ return 1; ++ } ++ } ++#endif ++ return 0; ++} ++ ++/* If MPLS offload request, verify we are testing hardware MPLS features ++ * instead of standard features for the netdev. ++ */ ++#if IS_ENABLED(CONFIG_NET_MPLS_GSO) ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ if (eth_p_mpls(type)) ++ features &= skb->dev->mpls_features; ++ ++ return features; ++} ++#else ++static netdev_features_t net_mpls_features(struct sk_buff *skb, ++ netdev_features_t features, ++ __be16 type) ++{ ++ return features; ++} ++#endif ++ ++static netdev_features_t harmonize_features(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ __be16 type; ++ ++ type = skb_network_protocol(skb, NULL); ++ features = net_mpls_features(skb, features, type); ++ ++ if (skb->ip_summed != CHECKSUM_NONE && ++ !can_checksum_protocol(features, type)) { ++ features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); ++ } ++ if (illegal_highdma(skb->dev, skb)) ++ features &= ~NETIF_F_SG; ++ ++ return features; ++} ++ ++netdev_features_t passthru_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return features; ++} ++EXPORT_SYMBOL(passthru_features_check); ++ ++static netdev_features_t dflt_features_check(struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ return vlan_features_check(skb, features); ++} ++ ++static netdev_features_t gso_features_check(const struct sk_buff *skb, ++ struct net_device *dev, ++ netdev_features_t features) ++{ ++ u16 gso_segs = skb_shinfo(skb)->gso_segs; ++ ++ if (gso_segs > READ_ONCE(dev->gso_max_segs)) ++ return features & ~NETIF_F_GSO_MASK; ++ ++ if (!skb_shinfo(skb)->gso_type) { ++ skb_warn_bad_offload(skb); ++ return features & ~NETIF_F_GSO_MASK; ++ } ++ ++ /* Support for GSO partial features requires software ++ * intervention before we can actually process the packets ++ * so we need to strip support for any partial features now ++ * and we can pull them back in after we have partially ++ * segmented the frame. ++ */ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) ++ features &= ~dev->gso_partial_features; ++ ++ /* Make sure to clear the IPv4 ID mangling feature if the ++ * IPv4 header has the potential to be fragmented. ++ */ ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { ++ struct iphdr *iph = skb->encapsulation ? 
++ inner_ip_hdr(skb) : ip_hdr(skb); ++ ++ if (!(iph->frag_off & htons(IP_DF))) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ } ++ ++ return features; ++} ++ ++netdev_features_t netif_skb_features(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ netdev_features_t features = dev->features; ++ ++ if (skb_is_gso(skb)) ++ features = gso_features_check(skb, dev, features); ++ ++ /* If encapsulation offload request, verify we are testing ++ * hardware encapsulation features instead of standard ++ * features for the netdev ++ */ ++ if (skb->encapsulation) ++ features &= dev->hw_enc_features; ++ ++ if (skb_vlan_tagged(skb)) ++ features = netdev_intersect_features(features, ++ dev->vlan_features | ++ NETIF_F_HW_VLAN_CTAG_TX | ++ NETIF_F_HW_VLAN_STAG_TX); ++ ++ if (dev->netdev_ops->ndo_features_check) ++ features &= dev->netdev_ops->ndo_features_check(skb, dev, ++ features); ++ else ++ features &= dflt_features_check(skb, dev, features); ++ ++ return harmonize_features(skb, features); ++} ++EXPORT_SYMBOL(netif_skb_features); ++ ++static int xmit_one(struct sk_buff *skb, struct net_device *dev, ++ struct netdev_queue *txq, bool more) ++{ ++ unsigned int len; ++ int rc; ++ ++ if (dev_nit_active(dev)) ++ dev_queue_xmit_nit(skb, dev); ++ ++ len = skb->len; ++ trace_net_dev_start_xmit(skb, dev); ++ rc = netdev_start_xmit(skb, dev, txq, more); ++ trace_net_dev_xmit(skb, rc, dev, len); ++ ++ return rc; ++} ++ ++struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, ++ struct netdev_queue *txq, int *ret) ++{ ++ struct sk_buff *skb = first; ++ int rc = NETDEV_TX_OK; ++ ++ while (skb) { ++ struct sk_buff *next = skb->next; ++ ++ skb_mark_not_on_list(skb); ++ rc = xmit_one(skb, dev, txq, next != NULL); ++ if (unlikely(!dev_xmit_complete(rc))) { ++ skb->next = next; ++ goto out; ++ } ++ ++ skb = next; ++ if (netif_tx_queue_stopped(txq) && skb) { ++ rc = NETDEV_TX_BUSY; ++ break; ++ } ++ } ++ ++out: ++ *ret = rc; ++ return skb; ++} ++ ++static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (skb_vlan_tag_present(skb) && ++ !vlan_hw_offload_capable(features, skb->vlan_proto)) ++ skb = __vlan_hwaccel_push_inside(skb); ++ return skb; ++} ++ ++int skb_csum_hwoffload_help(struct sk_buff *skb, ++ const netdev_features_t features) ++{ ++ if (unlikely(skb_csum_is_sctp(skb))) ++ return !!(features & NETIF_F_SCTP_CRC) ? 
0 : ++ skb_crc32c_csum_help(skb); ++ ++ if (features & NETIF_F_HW_CSUM) ++ return 0; ++ ++ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { ++ switch (skb->csum_offset) { ++ case offsetof(struct tcphdr, check): ++ case offsetof(struct udphdr, check): ++ return 0; ++ } ++ } ++ ++ return skb_checksum_help(skb); ++} ++EXPORT_SYMBOL(skb_csum_hwoffload_help); ++ ++static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ netdev_features_t features; ++ ++ features = netif_skb_features(skb); ++ skb = validate_xmit_vlan(skb, features); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ skb = sk_validate_xmit_skb(skb, dev); ++ if (unlikely(!skb)) ++ goto out_null; ++ ++ if (netif_needs_gso(skb, features)) { ++ struct sk_buff *segs; ++ ++ segs = skb_gso_segment(skb, features); ++ if (IS_ERR(segs)) { ++ goto out_kfree_skb; ++ } else if (segs) { ++ consume_skb(skb); ++ skb = segs; ++ } ++ } else { ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not ++ * support checksumming for this protocol, complete ++ * checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb->encapsulation) ++ skb_set_inner_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ else ++ skb_set_transport_header(skb, ++ skb_checksum_start_offset(skb)); ++ if (skb_csum_hwoffload_help(skb, features)) ++ goto out_kfree_skb; ++ } ++ } ++ ++ skb = validate_xmit_xfrm(skb, features, again); ++ ++ return skb; ++ ++out_kfree_skb: ++ kfree_skb(skb); ++out_null: ++ dev_core_stats_tx_dropped_inc(dev); ++ return NULL; ++} ++ ++struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) ++{ ++ struct sk_buff *next, *head = NULL, *tail; ++ ++ for (; skb != NULL; skb = next) { ++ next = skb->next; ++ skb_mark_not_on_list(skb); ++ ++ /* in case skb wont be segmented, point to itself */ ++ skb->prev = skb; ++ ++ skb = validate_xmit_skb(skb, dev, again); ++ if (!skb) ++ continue; ++ ++ if (!head) ++ head = skb; ++ else ++ tail->next = skb; ++ /* If skb was segmented, skb->prev points to ++ * the last segment. If not, it still contains skb. 
++ */ ++ tail = skb->prev; ++ } ++ return head; ++} ++EXPORT_SYMBOL_GPL(validate_xmit_skb_list); ++ ++static void qdisc_pkt_len_init(struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ ++ /* To get more precise estimation of bytes sent on wire, ++ * we add to pkt_len the headers size of all segments ++ */ ++ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { ++ unsigned int hdr_len; ++ u16 gso_segs = shinfo->gso_segs; ++ ++ /* mac layer + network layer */ ++ hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ /* + transport layer */ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ const struct tcphdr *th; ++ struct tcphdr _tcphdr; ++ ++ th = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_tcphdr), &_tcphdr); ++ if (likely(th)) ++ hdr_len += __tcp_hdrlen(th); ++ } else { ++ struct udphdr _udphdr; ++ ++ if (skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_udphdr), &_udphdr)) ++ hdr_len += sizeof(struct udphdr); ++ } ++ ++ if (shinfo->gso_type & SKB_GSO_DODGY) ++ gso_segs = DIV_ROUND_UP(skb->len - hdr_len, ++ shinfo->gso_size); ++ ++ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; ++ } ++} ++ ++static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, ++ struct sk_buff **to_free, ++ struct netdev_queue *txq) ++{ ++ int rc; ++ ++ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; ++ if (rc == NET_XMIT_SUCCESS) ++ trace_qdisc_enqueue(q, txq, skb); ++ return rc; ++} ++ ++static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ++ struct net_device *dev, ++ struct netdev_queue *txq) ++{ ++ spinlock_t *root_lock = qdisc_lock(q); ++ struct sk_buff *to_free = NULL; ++ bool contended; ++ int rc; ++ ++ qdisc_calculate_pkt_len(skb, q); ++ ++ if (q->flags & TCQ_F_NOLOCK) { ++ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && ++ qdisc_run_begin(q)) { ++ /* Retest nolock_qdisc_is_empty() within the protection ++ * of q->seqlock to protect from racing with requeuing. ++ */ ++ if (unlikely(!nolock_qdisc_is_empty(q))) { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ ++ goto no_lock_out; ++ } ++ ++ qdisc_bstats_cpu_update(q, skb); ++ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && ++ !nolock_qdisc_is_empty(q)) ++ __qdisc_run(q); ++ ++ qdisc_run_end(q); ++ return NET_XMIT_SUCCESS; ++ } ++ ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ qdisc_run(q); ++ ++no_lock_out: ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, ++ SKB_DROP_REASON_QDISC_DROP); ++ return rc; ++ } ++ ++ /* ++ * Heuristic to force contended enqueues to serialize on a ++ * separate lock before trying to get qdisc main lock. ++ * This permits qdisc->running owner to get the lock more ++ * often and dequeue packets faster. ++ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit ++ * and then other tasks will only enqueue packets. The packets will be ++ * sent after the qdisc owner is scheduled again. To prevent this ++ * scenario the task always serialize on the lock. 
++ */ ++ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); ++ if (unlikely(contended)) ++ spin_lock(&q->busylock); ++ ++ spin_lock(root_lock); ++ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { ++ __qdisc_drop(skb, &to_free); ++ rc = NET_XMIT_DROP; ++ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && ++ qdisc_run_begin(q)) { ++ /* ++ * This is a work-conserving queue; there are no old skbs ++ * waiting to be sent out; and the qdisc is not running - ++ * xmit the skb directly. ++ */ ++ ++ qdisc_bstats_update(q, skb); ++ ++ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ } ++ ++ qdisc_run_end(q); ++ rc = NET_XMIT_SUCCESS; ++ } else { ++ rc = dev_qdisc_enqueue(skb, q, &to_free, txq); ++ if (qdisc_run_begin(q)) { ++ if (unlikely(contended)) { ++ spin_unlock(&q->busylock); ++ contended = false; ++ } ++ __qdisc_run(q); ++ qdisc_run_end(q); ++ } ++ } ++ spin_unlock(root_lock); ++ if (unlikely(to_free)) ++ kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); ++ if (unlikely(contended)) ++ spin_unlock(&q->busylock); ++ return rc; ++} ++ ++#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) ++static void skb_update_prio(struct sk_buff *skb) ++{ ++ const struct netprio_map *map; ++ const struct sock *sk; ++ unsigned int prioidx; ++ ++ if (skb->priority) ++ return; ++ map = rcu_dereference_bh(skb->dev->priomap); ++ if (!map) ++ return; ++ sk = skb_to_full_sk(skb); ++ if (!sk) ++ return; ++ ++ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); ++ ++ if (prioidx < map->priomap_len) ++ skb->priority = map->priomap[prioidx]; ++} ++#else ++#define skb_update_prio(skb) ++#endif ++ ++/** ++ * dev_loopback_xmit - loop back @skb ++ * @net: network namespace this loopback is happening in ++ * @sk: sk needed to be a netfilter okfn ++ * @skb: buffer to transmit ++ */ ++int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) ++{ ++ skb_reset_mac_header(skb); ++ __skb_pull(skb, skb_network_offset(skb)); ++ skb->pkt_type = PACKET_LOOPBACK; ++ if (skb->ip_summed == CHECKSUM_NONE) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); ++ skb_dst_force(skb); ++ netif_rx(skb); ++ return 0; ++} ++EXPORT_SYMBOL(dev_loopback_xmit); ++ ++#ifdef CONFIG_NET_EGRESS ++static struct sk_buff * ++sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); ++ struct tcf_result cl_res; ++ ++ if (!miniq) ++ return skb; ++ ++ /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ *ret = NET_XMIT_DROP; ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ *ret = NET_XMIT_SUCCESS; ++ consume_skb(skb); ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* No need to push/pop skb's mac_header here on egress! 
*/ ++ skb_do_redirect(skb); ++ *ret = NET_XMIT_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ ++ return skb; ++} ++ ++static struct netdev_queue * ++netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) ++{ ++ int qm = skb_get_queue_mapping(skb); ++ ++ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); ++} ++ ++static bool netdev_xmit_txqueue_skipped(void) ++{ ++ return __this_cpu_read(softnet_data.xmit.skip_txqueue); ++} ++ ++void netdev_xmit_skip_txqueue(bool skip) ++{ ++ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); ++} ++EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); ++#endif /* CONFIG_NET_EGRESS */ ++ ++#ifdef CONFIG_XPS ++static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, ++ struct xps_dev_maps *dev_maps, unsigned int tci) ++{ ++ int tc = netdev_get_prio_tc_map(dev, skb->priority); ++ struct xps_map *map; ++ int queue_index = -1; ++ ++ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) ++ return queue_index; ++ ++ tci *= dev_maps->num_tc; ++ tci += tc; ++ ++ map = rcu_dereference(dev_maps->attr_map[tci]); ++ if (map) { ++ if (map->len == 1) ++ queue_index = map->queues[0]; ++ else ++ queue_index = map->queues[reciprocal_scale( ++ skb_get_hash(skb), map->len)]; ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = -1; ++ } ++ return queue_index; ++} ++#endif ++ ++static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, ++ struct sk_buff *skb) ++{ ++#ifdef CONFIG_XPS ++ struct xps_dev_maps *dev_maps; ++ struct sock *sk = skb->sk; ++ int queue_index = -1; ++ ++ if (!static_key_false(&xps_needed)) ++ return -1; ++ ++ rcu_read_lock(); ++ if (!static_key_false(&xps_rxqs_needed)) ++ goto get_cpus_map; ++ ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); ++ if (dev_maps) { ++ int tci = sk_rx_queue_get(sk); ++ ++ if (tci >= 0) ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ ++get_cpus_map: ++ if (queue_index < 0) { ++ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); ++ if (dev_maps) { ++ unsigned int tci = skb->sender_cpu - 1; ++ ++ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, ++ tci); ++ } ++ } ++ rcu_read_unlock(); ++ ++ return queue_index; ++#else ++ return -1; ++#endif ++} ++ ++u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return 0; ++} ++EXPORT_SYMBOL(dev_pick_tx_zero); ++ ++u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; ++} ++EXPORT_SYMBOL(dev_pick_tx_cpu_id); ++ ++u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ struct sock *sk = skb->sk; ++ int queue_index = sk_tx_queue_get(sk); ++ ++ sb_dev = sb_dev ? 
: dev; ++ ++ if (queue_index < 0 || skb->ooo_okay || ++ queue_index >= dev->real_num_tx_queues) { ++ int new_index = get_xps_queue(dev, sb_dev, skb); ++ ++ if (new_index < 0) ++ new_index = skb_tx_hash(dev, sb_dev, skb); ++ ++ if (queue_index != new_index && sk && ++ sk_fullsock(sk) && ++ rcu_access_pointer(sk->sk_dst_cache)) ++ sk_tx_queue_set(sk, new_index); ++ ++ queue_index = new_index; ++ } ++ ++ return queue_index; ++} ++EXPORT_SYMBOL(netdev_pick_tx); ++ ++struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, ++ struct sk_buff *skb, ++ struct net_device *sb_dev) ++{ ++ int queue_index = 0; ++ ++#ifdef CONFIG_XPS ++ u32 sender_cpu = skb->sender_cpu - 1; ++ ++ if (sender_cpu >= (u32)NR_CPUS) ++ skb->sender_cpu = raw_smp_processor_id() + 1; ++#endif ++ ++ if (dev->real_num_tx_queues != 1) { ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_select_queue) ++ queue_index = ops->ndo_select_queue(dev, skb, sb_dev); ++ else ++ queue_index = netdev_pick_tx(dev, skb, sb_dev); ++ ++ queue_index = netdev_cap_txqueue(dev, queue_index); ++ } ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++/** ++ * __dev_queue_xmit() - transmit a buffer ++ * @skb: buffer to transmit ++ * @sb_dev: suboordinate device used for L2 forwarding offload ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) ++ * ++ * Return: ++ * * 0 - buffer successfully transmitted ++ * * positive qdisc return code - NET_XMIT_DROP etc. ++ * * negative errno - other errors ++ */ ++int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq = NULL; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ bool again = false; ++ ++ skb_reset_mac_header(skb); ++ skb_assert_len(skb); ++ ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) ++ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ rcu_read_lock_bh(); ++ ++ skb_update_prio(skb); ++ ++ qdisc_pkt_len_init(skb); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_at_ingress = 0; ++#endif ++#ifdef CONFIG_NET_EGRESS ++ if (static_branch_unlikely(&egress_needed_key)) { ++ if (nf_hook_egress_active()) { ++ skb = nf_hook_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ } ++ ++ netdev_xmit_skip_txqueue(false); ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_egress(skb, &rc, dev); ++ if (!skb) ++ goto out; ++ nf_skip_egress(skb, false); ++ ++ if (netdev_xmit_txqueue_skipped()) ++ txq = netdev_tx_queue_mapping(dev, skb); ++ } ++#endif ++ /* If device/qdisc don't need skb->dst, release it right now while ++ * its hot in this cpu cache. 
++ */ ++ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) ++ skb_dst_drop(skb); ++ else ++ skb_dst_force(skb); ++ ++ if (!txq) ++ txq = netdev_core_pick_tx(dev, skb, sb_dev); ++ ++ q = rcu_dereference_bh(txq->qdisc); ++ ++ trace_net_dev_queue(skb); ++ if (q->enqueue) { ++ rc = __dev_xmit_skb(skb, q, dev, txq); ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ * loopback, all the sorts of tunnels... ++ ++ * Really, it is unlikely that netif_tx_lock protection is necessary ++ * here. (f.e. loopback and IP tunnels are clean ignoring statistics ++ * counters.) ++ * However, it is possible, that they rely on protection ++ * made by us here. ++ ++ * Check this and shot the lock. It is not prone from deadlocks. ++ *Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ /* Other cpus might concurrently change txq->xmit_lock_owner ++ * to -1 or to their cpu id, but not to our id. ++ */ ++ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { ++ if (dev_xmit_recursion()) ++ goto recursion_alert; ++ ++ skb = validate_xmit_skb(skb, dev, &again); ++ if (!skb) ++ goto out; ++ ++ HARD_TX_LOCK(dev, txq, cpu); ++ ++ if (!netif_xmit_stopped(txq)) { ++ dev_xmit_recursion_inc(); ++ skb = dev_hard_start_xmit(skb, dev, txq, &rc); ++ dev_xmit_recursion_dec(); ++ if (dev_xmit_complete(rc)) { ++ HARD_TX_UNLOCK(dev, txq); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ net_crit_ratelimited("Virtual device %s asks to queue packet!\n", ++ dev->name); ++ } else { ++ /* Recursion is detected! It is possible, ++ * unfortunately ++ */ ++recursion_alert: ++ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", ++ dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ rcu_read_unlock_bh(); ++ ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return rc; ++out: ++ rcu_read_unlock_bh(); ++ return rc; ++} ++EXPORT_SYMBOL(__dev_queue_xmit); ++ ++int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) ++{ ++ struct net_device *dev = skb->dev; ++ struct sk_buff *orig_skb = skb; ++ struct netdev_queue *txq; ++ int ret = NETDEV_TX_BUSY; ++ bool again = false; ++ ++ if (unlikely(!netif_running(dev) || ++ !netif_carrier_ok(dev))) ++ goto drop; ++ ++ skb = validate_xmit_skb_list(skb, dev, &again); ++ if (skb != orig_skb) ++ goto drop; ++ ++ skb_set_queue_mapping(skb, queue_id); ++ txq = skb_get_tx_queue(dev, skb); ++ ++ local_bh_disable(); ++ ++ dev_xmit_recursion_inc(); ++ HARD_TX_LOCK(dev, txq, smp_processor_id()); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) ++ ret = netdev_start_xmit(skb, dev, txq, false); ++ HARD_TX_UNLOCK(dev, txq); ++ dev_xmit_recursion_dec(); ++ ++ local_bh_enable(); ++ return ret; ++drop: ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb_list(skb); ++ return NET_XMIT_DROP; ++} ++EXPORT_SYMBOL(__dev_direct_xmit); ++ ++/************************************************************************* ++ * Receiver routines ++ *************************************************************************/ ++ ++int netdev_max_backlog __read_mostly = 1000; ++EXPORT_SYMBOL(netdev_max_backlog); ++ ++int netdev_tstamp_prequeue __read_mostly = 1; ++unsigned int sysctl_skb_defer_max __read_mostly = 64; ++int netdev_budget __read_mostly = 300; ++/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ ++unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ++int weight_p __read_mostly = 64; /* old backlog weight */ ++int dev_weight_rx_bias 
__read_mostly = 1; /* bias for backlog weight */ ++int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ ++int dev_rx_weight __read_mostly = 64; ++int dev_tx_weight __read_mostly = 64; ++ ++/* Called with irq disabled */ ++static inline void ____napi_schedule(struct softnet_data *sd, ++ struct napi_struct *napi) ++{ ++ struct task_struct *thread; ++ ++ lockdep_assert_irqs_disabled(); ++ ++ if (test_bit(NAPI_STATE_THREADED, &napi->state)) { ++ /* Paired with smp_mb__before_atomic() in ++ * napi_enable()/dev_set_threaded(). ++ * Use READ_ONCE() to guarantee a complete ++ * read on napi->thread. Only call ++ * wake_up_process() when it's not NULL. ++ */ ++ thread = READ_ONCE(napi->thread); ++ if (thread) { ++ /* Avoid doing set_bit() if the thread is in ++ * INTERRUPTIBLE state, cause napi_thread_wait() ++ * makes sure to proceed with napi polling ++ * if the thread is explicitly woken from here. ++ */ ++ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) ++ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); ++ wake_up_process(thread); ++ return; ++ } ++ } ++ ++ list_add_tail(&napi->poll_list, &sd->poll_list); ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++} ++ ++#ifdef CONFIG_RPS ++ ++/* One global table that all flow-based protocols share. */ ++struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; ++EXPORT_SYMBOL(rps_sock_flow_table); ++u32 rps_cpu_mask __read_mostly; ++EXPORT_SYMBOL(rps_cpu_mask); ++ ++struct static_key_false rps_needed __read_mostly; ++EXPORT_SYMBOL(rps_needed); ++struct static_key_false rfs_needed __read_mostly; ++EXPORT_SYMBOL(rfs_needed); ++ ++static struct rps_dev_flow * ++set_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow *rflow, u16 next_cpu) ++{ ++ if (next_cpu < nr_cpu_ids) { ++#ifdef CONFIG_RFS_ACCEL ++ struct netdev_rx_queue *rxqueue; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *old_rflow; ++ u32 flow_id; ++ u16 rxq_index; ++ int rc; ++ ++ /* Should we steer this flow to a different hardware queue? */ ++ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || ++ !(dev->features & NETIF_F_NTUPLE)) ++ goto out; ++ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); ++ if (rxq_index == skb_get_rx_queue(skb)) ++ goto out; ++ ++ rxqueue = dev->_rx + rxq_index; ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (!flow_table) ++ goto out; ++ flow_id = skb_get_hash(skb) & flow_table->mask; ++ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, ++ rxq_index, flow_id); ++ if (rc < 0) ++ goto out; ++ old_rflow = rflow; ++ rflow = &flow_table->flows[flow_id]; ++ rflow->filter = rc; ++ if (old_rflow->filter == rflow->filter) ++ old_rflow->filter = RPS_NO_FILTER; ++ out: ++#endif ++ rflow->last_qtail = ++ per_cpu(softnet_data, next_cpu).input_queue_head; ++ } ++ ++ rflow->cpu = next_cpu; ++ return rflow; ++} ++ ++/* ++ * get_rps_cpu is called from netif_receive_skb and returns the target ++ * CPU from the RPS map of the receiving queue for a given skb. ++ * rcu_read_lock must be held on entry. 
++ */ ++static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ++ struct rps_dev_flow **rflowp) ++{ ++ const struct rps_sock_flow_table *sock_flow_table; ++ struct netdev_rx_queue *rxqueue = dev->_rx; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_map *map; ++ int cpu = -1; ++ u32 tcpu; ++ u32 hash; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ goto done; ++ } ++ rxqueue += index; ++ } ++ ++ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ ++ ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ map = rcu_dereference(rxqueue->rps_map); ++ if (!flow_table && !map) ++ goto done; ++ ++ skb_reset_network_header(skb); ++ hash = skb_get_hash(skb); ++ if (!hash) ++ goto done; ++ ++ sock_flow_table = rcu_dereference(rps_sock_flow_table); ++ if (flow_table && sock_flow_table) { ++ struct rps_dev_flow *rflow; ++ u32 next_cpu; ++ u32 ident; ++ ++ /* First check into global flow table if there is a match */ ++ ident = sock_flow_table->ents[hash & sock_flow_table->mask]; ++ if ((ident ^ hash) & ~rps_cpu_mask) ++ goto try_rps; ++ ++ next_cpu = ident & rps_cpu_mask; ++ ++ /* OK, now we know there is a match, ++ * we can look at the local (per receive queue) flow table ++ */ ++ rflow = &flow_table->flows[hash & flow_table->mask]; ++ tcpu = rflow->cpu; ++ ++ /* ++ * If the desired CPU (where last recvmsg was done) is ++ * different from current CPU (one in the rx-queue flow ++ * table entry), switch if one of the following holds: ++ * - Current CPU is unset (>= nr_cpu_ids). ++ * - Current CPU is offline. ++ * - The current CPU's queue tail has advanced beyond the ++ * last packet that was enqueued using this table entry. ++ * This guarantees that all previous packets for the flow ++ * have been dequeued, thus preserving in order delivery. ++ */ ++ if (unlikely(tcpu != next_cpu) && ++ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ++ ((int)(per_cpu(softnet_data, tcpu).input_queue_head - ++ rflow->last_qtail)) >= 0)) { ++ tcpu = next_cpu; ++ rflow = set_rps_cpu(dev, skb, rflow, next_cpu); ++ } ++ ++ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { ++ *rflowp = rflow; ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++try_rps: ++ ++ if (map) { ++ tcpu = map->cpus[reciprocal_scale(hash, map->len)]; ++ if (cpu_online(tcpu)) { ++ cpu = tcpu; ++ goto done; ++ } ++ } ++ ++done: ++ return cpu; ++} ++ ++#ifdef CONFIG_RFS_ACCEL ++ ++/** ++ * rps_may_expire_flow - check whether an RFS hardware filter may be removed ++ * @dev: Device on which the filter was set ++ * @rxq_index: RX queue index ++ * @flow_id: Flow ID passed to ndo_rx_flow_steer() ++ * @filter_id: Filter ID returned by ndo_rx_flow_steer() ++ * ++ * Drivers that implement ndo_rx_flow_steer() should periodically call ++ * this function for each installed filter and remove the filters for ++ * which it returns %true. 
++ */ ++bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, ++ u32 flow_id, u16 filter_id) ++{ ++ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; ++ struct rps_dev_flow_table *flow_table; ++ struct rps_dev_flow *rflow; ++ bool expire = true; ++ unsigned int cpu; ++ ++ rcu_read_lock(); ++ flow_table = rcu_dereference(rxqueue->rps_flow_table); ++ if (flow_table && flow_id <= flow_table->mask) { ++ rflow = &flow_table->flows[flow_id]; ++ cpu = READ_ONCE(rflow->cpu); ++ if (rflow->filter == filter_id && cpu < nr_cpu_ids && ++ ((int)(per_cpu(softnet_data, cpu).input_queue_head - ++ rflow->last_qtail) < ++ (int)(10 * flow_table->mask))) ++ expire = false; ++ } ++ rcu_read_unlock(); ++ return expire; ++} ++EXPORT_SYMBOL(rps_may_expire_flow); ++ ++#endif /* CONFIG_RFS_ACCEL */ ++ ++/* Called from hardirq (IPI) context */ ++static void rps_trigger_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ ____napi_schedule(sd, &sd->backlog); ++ sd->received_rps++; ++} ++ ++#endif /* CONFIG_RPS */ ++ ++/* Called from hardirq (IPI) context */ ++static void trigger_rx_softirq(void *data) ++{ ++ struct softnet_data *sd = data; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ smp_store_release(&sd->defer_ipi_scheduled, 0); ++} ++ ++/* ++ * Check if this softnet_data structure is another cpu one ++ * If yes, queue it to our IPI list and return 1 ++ * If no, return 0 ++ */ ++static int napi_schedule_rps(struct softnet_data *sd) ++{ ++ struct softnet_data *mysd = this_cpu_ptr(&softnet_data); ++ ++#ifdef CONFIG_RPS ++ if (sd != mysd) { ++ sd->rps_ipi_next = mysd->rps_ipi_list; ++ mysd->rps_ipi_list = sd; ++ ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ return 1; ++ } ++#endif /* CONFIG_RPS */ ++ __napi_schedule_irqoff(&mysd->backlog); ++ return 0; ++} ++ ++#ifdef CONFIG_NET_FLOW_LIMIT ++int netdev_flow_limit_table_len __read_mostly = (1 << 12); ++#endif ++ ++static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) ++{ ++#ifdef CONFIG_NET_FLOW_LIMIT ++ struct sd_flow_limit *fl; ++ struct softnet_data *sd; ++ unsigned int old_flow, new_flow; ++ ++ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) ++ return false; ++ ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rcu_read_lock(); ++ fl = rcu_dereference(sd->flow_limit); ++ if (fl) { ++ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); ++ old_flow = fl->history[fl->history_head]; ++ fl->history[fl->history_head] = new_flow; ++ ++ fl->history_head++; ++ fl->history_head &= FLOW_LIMIT_HISTORY - 1; ++ ++ if (likely(fl->buckets[old_flow])) ++ fl->buckets[old_flow]--; ++ ++ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { ++ fl->count++; ++ rcu_read_unlock(); ++ return true; ++ } ++ } ++ rcu_read_unlock(); ++#endif ++ return false; ++} ++ ++/* ++ * enqueue_to_backlog is called to queue an skb to a per CPU backlog ++ * queue (may be a remote CPU queue). 
++ */ ++static int enqueue_to_backlog(struct sk_buff *skb, int cpu, ++ unsigned int *qtail) ++{ ++ enum skb_drop_reason reason; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int qlen; ++ ++ reason = SKB_DROP_REASON_NOT_SPECIFIED; ++ sd = &per_cpu(softnet_data, cpu); ++ ++ rps_lock_irqsave(sd, &flags); ++ if (!netif_running(skb->dev)) ++ goto drop; ++ qlen = skb_queue_len(&sd->input_pkt_queue); ++ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { ++ if (qlen) { ++enqueue: ++ __skb_queue_tail(&sd->input_pkt_queue, skb); ++ input_queue_tail_incr_save(sd, qtail); ++ rps_unlock_irq_restore(sd, &flags); ++ return NET_RX_SUCCESS; ++ } ++ ++ /* Schedule NAPI for backlog device ++ * We can use non atomic operation since we own the queue lock ++ */ ++ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) ++ napi_schedule_rps(sd); ++ goto enqueue; ++ } ++ reason = SKB_DROP_REASON_CPU_BACKLOG; ++ ++drop: ++ sd->dropped++; ++ rps_unlock_irq_restore(sd, &flags); ++ ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ kfree_skb_reason(skb, reason); ++ return NET_RX_DROP; ++} ++ ++static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_rx_queue *rxqueue; ++ ++ rxqueue = dev->_rx; ++ ++ if (skb_rx_queue_recorded(skb)) { ++ u16 index = skb_get_rx_queue(skb); ++ ++ if (unlikely(index >= dev->real_num_rx_queues)) { ++ WARN_ONCE(dev->real_num_rx_queues > 1, ++ "%s received packet on queue %u, but number " ++ "of RX queues is %u\n", ++ dev->name, index, dev->real_num_rx_queues); ++ ++ return rxqueue; /* Return first rxqueue */ ++ } ++ rxqueue += index; ++ } ++ return rxqueue; ++} ++ ++u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ void *orig_data, *orig_data_end, *hard_start; ++ struct netdev_rx_queue *rxqueue; ++ bool orig_bcast, orig_host; ++ u32 mac_len, frame_sz; ++ __be16 orig_eth_type; ++ struct ethhdr *eth; ++ u32 metalen, act; ++ int off; ++ ++ /* The XDP program wants to see the packet starting at the MAC ++ * header. 
++ */ ++ mac_len = skb->data - skb_mac_header(skb); ++ hard_start = skb->data - skb_headroom(skb); ++ ++ /* SKB "head" area always have tailroom for skb_shared_info */ ++ frame_sz = (void *)skb_end_pointer(skb) - hard_start; ++ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ rxqueue = netif_get_rxqueue(skb); ++ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); ++ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, ++ skb_headlen(skb) + mac_len, true); ++ ++ orig_data_end = xdp->data_end; ++ orig_data = xdp->data; ++ eth = (struct ethhdr *)xdp->data; ++ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); ++ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); ++ orig_eth_type = eth->h_proto; ++ ++ act = bpf_prog_run_xdp(xdp_prog, xdp); ++ ++ /* check if bpf_xdp_adjust_head was used */ ++ off = xdp->data - orig_data; ++ if (off) { ++ if (off > 0) ++ __skb_pull(skb, off); ++ else if (off < 0) ++ __skb_push(skb, -off); ++ ++ skb->mac_header += off; ++ skb_reset_network_header(skb); ++ } ++ ++ /* check if bpf_xdp_adjust_tail was used */ ++ off = xdp->data_end - orig_data_end; ++ if (off != 0) { ++ skb_set_tail_pointer(skb, xdp->data_end - xdp->data); ++ skb->len += off; /* positive on grow, negative on shrink */ ++ } ++ ++ /* check if XDP changed eth hdr such SKB needs update */ ++ eth = (struct ethhdr *)xdp->data; ++ if ((orig_eth_type != eth->h_proto) || ++ (orig_host != ether_addr_equal_64bits(eth->h_dest, ++ skb->dev->dev_addr)) || ++ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { ++ __skb_push(skb, ETH_HLEN); ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ } ++ ++ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull ++ * before calling us again on redirect path. We do not call do_redirect ++ * as we leave that up to the caller. ++ * ++ * Caller is responsible for managing lifetime of skb (i.e. calling ++ * kfree_skb in response to actions it cannot handle/XDP_DROP). ++ */ ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ __skb_push(skb, mac_len); ++ break; ++ case XDP_PASS: ++ metalen = xdp->data - xdp->data_meta; ++ if (metalen) ++ skb_metadata_set(skb, metalen); ++ break; ++ } ++ ++ return act; ++} ++ ++static u32 netif_receive_generic_xdp(struct sk_buff *skb, ++ struct xdp_buff *xdp, ++ struct bpf_prog *xdp_prog) ++{ ++ u32 act = XDP_DROP; ++ ++ /* Reinjected packets coming from act_mirred or similar should ++ * not get XDP generic processing. ++ */ ++ if (skb_is_redirected(skb)) ++ return XDP_PASS; ++ ++ /* XDP packets must be linear and must have sufficient headroom ++ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also ++ * native XDP provides, thus we need to do it here as well. ++ */ ++ if (skb_cloned(skb) || skb_is_nonlinear(skb) || ++ skb_headroom(skb) < XDP_PACKET_HEADROOM) { ++ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); ++ int troom = skb->tail + skb->data_len - skb->end; ++ ++ /* In case we have to go down the path and also linearize, ++ * then lets do the pskb_expand_head() work just once here. ++ */ ++ if (pskb_expand_head(skb, ++ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, ++ troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) ++ goto do_drop; ++ if (skb_linearize(skb)) ++ goto do_drop; ++ } ++ ++ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); ++ switch (act) { ++ case XDP_REDIRECT: ++ case XDP_TX: ++ case XDP_PASS: ++ break; ++ default: ++ bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_ABORTED: ++ trace_xdp_exception(skb->dev, xdp_prog, act); ++ fallthrough; ++ case XDP_DROP: ++ do_drop: ++ kfree_skb(skb); ++ break; ++ } ++ ++ return act; ++} ++ ++/* When doing generic XDP we have to bypass the qdisc layer and the ++ * network taps in order to match in-driver-XDP behavior. This also means ++ * that XDP packets are able to starve other packets going through a qdisc, ++ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX ++ * queues, so they do not have this starvation issue. ++ */ ++void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) ++{ ++ struct net_device *dev = skb->dev; ++ struct netdev_queue *txq; ++ bool free_skb = true; ++ int cpu, rc; ++ ++ txq = netdev_core_pick_tx(dev, skb, NULL); ++ cpu = smp_processor_id(); ++ HARD_TX_LOCK(dev, txq, cpu); ++ if (!netif_xmit_frozen_or_drv_stopped(txq)) { ++ rc = netdev_start_xmit(skb, dev, txq, 0); ++ if (dev_xmit_complete(rc)) ++ free_skb = false; ++ } ++ HARD_TX_UNLOCK(dev, txq); ++ if (free_skb) { ++ trace_xdp_exception(dev, xdp_prog, XDP_TX); ++ dev_core_stats_tx_dropped_inc(dev); ++ kfree_skb(skb); ++ } ++} ++ ++static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); ++ ++int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) ++{ ++ if (xdp_prog) { ++ struct xdp_buff xdp; ++ u32 act; ++ int err; ++ ++ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); ++ if (act != XDP_PASS) { ++ switch (act) { ++ case XDP_REDIRECT: ++ err = xdp_do_generic_redirect(skb->dev, skb, ++ &xdp, xdp_prog); ++ if (err) ++ goto out_redir; ++ break; ++ case XDP_TX: ++ generic_xdp_tx(skb, xdp_prog); ++ break; ++ } ++ return XDP_DROP; ++ } ++ } ++ return XDP_PASS; ++out_redir: ++ kfree_skb_reason(skb, SKB_DROP_REASON_XDP); ++ return XDP_DROP; ++} ++EXPORT_SYMBOL_GPL(do_xdp_generic); ++ ++static int netif_rx_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_rx(skb); ++ ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu; ++ ++ rcu_read_lock(); ++ ++ cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ if (cpu < 0) ++ cpu = smp_processor_id(); ++ ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ ++ rcu_read_unlock(); ++ } else ++#endif ++ { ++ unsigned int qtail; ++ ++ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); ++ } ++ return ret; ++} ++ ++/** ++ * __netif_rx - Slightly optimized version of netif_rx ++ * @skb: buffer to post ++ * ++ * This behaves as netif_rx except that it does not disable bottom halves. ++ * As a result this function may only be invoked from the interrupt context ++ * (either hard or soft interrupt). 
++ */ ++int __netif_rx(struct sk_buff *skb) ++{ ++ int ret; ++ ++ lockdep_assert_once(hardirq_count() | softirq_count()); ++ ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ return ret; ++} ++EXPORT_SYMBOL(__netif_rx); ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process via the backlog NAPI device. It ++ * always succeeds. The buffer may be dropped during processing for ++ * congestion control or by the protocol layers. ++ * The network buffer is passed via the backlog NAPI device. Modern NIC ++ * driver should use NAPI and GRO. ++ * This function can used from interrupt and from process context. The ++ * caller from process context must not disable interrupts before invoking ++ * this function. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++int netif_rx(struct sk_buff *skb) ++{ ++ bool need_bh_off = !(hardirq_count() | softirq_count()); ++ int ret; ++ ++ if (need_bh_off) ++ local_bh_disable(); ++ trace_netif_rx_entry(skb); ++ ret = netif_rx_internal(skb); ++ trace_netif_rx_exit(ret); ++ if (need_bh_off) ++ local_bh_enable(); ++ return ret; ++} ++EXPORT_SYMBOL(netif_rx); ++ ++static __latent_entropy void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ ++ clist = clist->next; ++ ++ WARN_ON(refcount_read(&skb->users)); ++ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) ++ trace_consume_skb(skb); ++ else ++ trace_kfree_skb(skb, net_tx_action, ++ SKB_DROP_REASON_NOT_SPECIFIED); ++ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) ++ __kfree_skb(skb); ++ else ++ __kfree_skb_defer(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct Qdisc *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ sd->output_queue_tailp = &sd->output_queue; ++ local_irq_enable(); ++ ++ rcu_read_lock(); ++ ++ while (head) { ++ struct Qdisc *q = head; ++ spinlock_t *root_lock = NULL; ++ ++ head = head->next_sched; ++ ++ /* We need to make sure head->next_sched is read ++ * before clearing __QDISC_STATE_SCHED ++ */ ++ smp_mb__before_atomic(); ++ ++ if (!(q->flags & TCQ_F_NOLOCK)) { ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, ++ &q->state))) { ++ /* There is a synchronize_net() between ++ * STATE_DEACTIVATED flag being set and ++ * qdisc_reset()/some_qdisc_is_busy() in ++ * dev_deactivate(), so we can safely bail out ++ * early here to avoid data race between ++ * qdisc_deactivate() and some_qdisc_is_busy() ++ * for lockless qdisc. 
++ */ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ continue; ++ } ++ ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ qdisc_run(q); ++ if (root_lock) ++ spin_unlock(root_lock); ++ } ++ ++ rcu_read_unlock(); ++ } ++ ++ xfrm_dev_backlog(sd); ++} ++ ++#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) ++/* This hook is defined here for ATM LANE */ ++int (*br_fdb_test_addr_hook)(struct net_device *dev, ++ unsigned char *addr) __read_mostly; ++EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); ++#endif ++ ++static inline struct sk_buff * ++sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, ++ struct net_device *orig_dev, bool *another) ++{ ++#ifdef CONFIG_NET_CLS_ACT ++ struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); ++ struct tcf_result cl_res; ++ ++ /* If there's at least one ingress present somewhere (so ++ * we get here via enabled static key), remaining devices ++ * that are not configured with an ingress qdisc will bail ++ * out here. ++ */ ++ if (!miniq) ++ return skb; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ qdisc_skb_cb(skb)->pkt_len = skb->len; ++ tc_skb_cb(skb)->mru = 0; ++ tc_skb_cb(skb)->post_ct = false; ++ skb->tc_at_ingress = 1; ++ mini_qdisc_bstats_cpu_update(miniq, skb); ++ ++ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { ++ case TC_ACT_OK: ++ case TC_ACT_RECLASSIFY: ++ skb->tc_index = TC_H_MIN(cl_res.classid); ++ break; ++ case TC_ACT_SHOT: ++ mini_qdisc_qstats_cpu_drop(miniq); ++ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); ++ *ret = NET_RX_DROP; ++ return NULL; ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ consume_skb(skb); ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_REDIRECT: ++ /* skb_mac_header check was done by cls/act_bpf, so ++ * we can safely push the L2 header back before ++ * redirecting to another netdev ++ */ ++ __skb_push(skb, skb->mac_len); ++ if (skb_do_redirect(skb) == -EAGAIN) { ++ __skb_pull(skb, skb->mac_len); ++ *another = true; ++ break; ++ } ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ case TC_ACT_CONSUMED: ++ *ret = NET_RX_SUCCESS; ++ return NULL; ++ default: ++ break; ++ } ++#endif /* CONFIG_NET_CLS_ACT */ ++ return skb; ++} ++ ++/** ++ * netdev_is_rx_handler_busy - check if receive handler is registered ++ * @dev: device to check ++ * ++ * Check if a receive handler is already registered for a given device. ++ * Return true if there one. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++bool netdev_is_rx_handler_busy(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ return dev && rtnl_dereference(dev->rx_handler); ++} ++EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); ++ ++/** ++ * netdev_rx_handler_register - register receive handler ++ * @dev: device to register a handler for ++ * @rx_handler: receive handler to register ++ * @rx_handler_data: data pointer that is used by rx handler ++ * ++ * Register a receive handler for a device. This handler will then be ++ * called from __netif_receive_skb. A negative errno code is returned ++ * on a failure. ++ * ++ * The caller must hold the rtnl_mutex. ++ * ++ * For a general description of rx_handler, see enum rx_handler_result. 
++ */ ++int netdev_rx_handler_register(struct net_device *dev, ++ rx_handler_func_t *rx_handler, ++ void *rx_handler_data) ++{ ++ if (netdev_is_rx_handler_busy(dev)) ++ return -EBUSY; ++ ++ if (dev->priv_flags & IFF_NO_RX_HANDLER) ++ return -EINVAL; ++ ++ /* Note: rx_handler_data must be set before rx_handler */ ++ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); ++ rcu_assign_pointer(dev->rx_handler, rx_handler); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_register); ++ ++/** ++ * netdev_rx_handler_unregister - unregister receive handler ++ * @dev: device to unregister a handler from ++ * ++ * Unregister a receive handler from a device. ++ * ++ * The caller must hold the rtnl_mutex. ++ */ ++void netdev_rx_handler_unregister(struct net_device *dev) ++{ ++ ++ ASSERT_RTNL(); ++ RCU_INIT_POINTER(dev->rx_handler, NULL); ++ /* a reader seeing a non NULL rx_handler in a rcu_read_lock() ++ * section has a guarantee to see a non NULL rx_handler_data ++ * as well. ++ */ ++ synchronize_net(); ++ RCU_INIT_POINTER(dev->rx_handler_data, NULL); ++} ++EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); ++ ++/* ++ * Limit the use of PFMEMALLOC reserves to those protocols that implement ++ * the special handling of PFMEMALLOC skbs. ++ */ ++static bool skb_pfmemalloc_protocol(struct sk_buff *skb) ++{ ++ switch (skb->protocol) { ++ case htons(ETH_P_ARP): ++ case htons(ETH_P_IP): ++ case htons(ETH_P_IPV6): ++ case htons(ETH_P_8021Q): ++ case htons(ETH_P_8021AD): ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, ++ int *ret, struct net_device *orig_dev) ++{ ++ if (nf_hook_ingress_active(skb)) { ++ int ingress_retval; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(skb, *pt_prev, orig_dev); ++ *pt_prev = NULL; ++ } ++ ++ rcu_read_lock(); ++ ingress_retval = nf_hook_ingress(skb); ++ rcu_read_unlock(); ++ return ingress_retval; ++ } ++ return 0; ++} ++ ++static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, ++ struct packet_type **ppt_prev) ++{ ++ struct packet_type *ptype, *pt_prev; ++ rx_handler_func_t *rx_handler; ++ struct sk_buff *skb = *pskb; ++ struct net_device *orig_dev; ++ bool deliver_exact = false; ++ int ret = NET_RX_DROP; ++ __be16 type; ++ ++ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ trace_netif_receive_skb(skb); ++ ++ orig_dev = skb->dev; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ pt_prev = NULL; ++ ++another_round: ++ skb->skb_iif = skb->dev->ifindex; ++ ++ __this_cpu_inc(softnet_data.processed); ++ ++ if (static_branch_unlikely(&generic_xdp_needed_key)) { ++ int ret2; ++ ++ migrate_disable(); ++ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); ++ migrate_enable(); ++ ++ if (ret2 != XDP_PASS) { ++ ret = NET_RX_DROP; ++ goto out; ++ } ++ } ++ ++ if (eth_type_vlan(skb->protocol)) { ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ } ++ ++ if (skb_skip_tc_classify(skb)) ++ goto skip_classify; ++ ++ if (pfmemalloc) ++ goto skip_taps; ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = ptype; ++ } ++ ++skip_taps: ++#ifdef CONFIG_NET_INGRESS ++ if 
(static_branch_unlikely(&ingress_needed_key)) { ++ bool another = false; ++ ++ nf_skip_egress(skb, true); ++ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, ++ &another); ++ if (another) ++ goto another_round; ++ if (!skb) ++ goto out; ++ ++ nf_skip_egress(skb, false); ++ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) ++ goto out; ++ } ++#endif ++ skb_reset_redirect(skb); ++skip_classify: ++ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) ++ goto drop; ++ ++ if (skb_vlan_tag_present(skb)) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ if (vlan_do_receive(&skb)) ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ } ++ ++ rx_handler = rcu_dereference(skb->dev->rx_handler); ++ if (rx_handler) { ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev, orig_dev); ++ pt_prev = NULL; ++ } ++ switch (rx_handler(&skb)) { ++ case RX_HANDLER_CONSUMED: ++ ret = NET_RX_SUCCESS; ++ goto out; ++ case RX_HANDLER_ANOTHER: ++ goto another_round; ++ case RX_HANDLER_EXACT: ++ deliver_exact = true; ++ break; ++ case RX_HANDLER_PASS: ++ break; ++ default: ++ BUG(); ++ } ++ } ++ ++ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { ++check_vlan_id: ++ if (skb_vlan_tag_get_id(skb)) { ++ /* Vlan id is non 0 and vlan_do_receive() above couldn't ++ * find vlan device. ++ */ ++ skb->pkt_type = PACKET_OTHERHOST; ++ } else if (eth_type_vlan(skb->protocol)) { ++ /* Outer header is 802.1P with vlan 0, inner header is ++ * 802.1Q or 802.1AD and vlan_do_receive() above could ++ * not find vlan dev for vlan id 0. ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ skb = skb_vlan_untag(skb); ++ if (unlikely(!skb)) ++ goto out; ++ if (vlan_do_receive(&skb)) ++ /* After stripping off 802.1P header with vlan 0 ++ * vlan dev is found for inner header. ++ */ ++ goto another_round; ++ else if (unlikely(!skb)) ++ goto out; ++ else ++ /* We have stripped outer 802.1P vlan 0 header. ++ * But could not find vlan dev. ++ * check again for vlan id to set OTHERHOST. ++ */ ++ goto check_vlan_id; ++ } ++ /* Note: we might in the future use prio bits ++ * and set skb->priority like in vlan_do_receive() ++ * For the time being, just ignore Priority Code Point ++ */ ++ __vlan_hwaccel_clear_tag(skb); ++ } ++ ++ type = skb->protocol; ++ ++ /* deliver only exact match when indicated */ ++ if (likely(!deliver_exact)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &ptype_base[ntohs(type) & ++ PTYPE_HASH_MASK]); ++ } ++ ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &orig_dev->ptype_specific); ++ ++ if (unlikely(skb->dev != orig_dev)) { ++ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, ++ &skb->dev->ptype_specific); ++ } ++ ++ if (pt_prev) { ++ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) ++ goto drop; ++ *ppt_prev = pt_prev; ++ } else { ++drop: ++ if (!deliver_exact) ++ dev_core_stats_rx_dropped_inc(skb->dev); ++ else ++ dev_core_stats_rx_nohandler_inc(skb->dev); ++ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ /* The invariant here is that if *ppt_prev is not NULL ++ * then skb should also be non-NULL. ++ * ++ * Apparently *ppt_prev assignment above holds this invariant due to ++ * skb dereferencing near it. 
++ */ ++ *pskb = skb; ++ return ret; ++} ++ ++static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) ++{ ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ int ret; ++ ++ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (pt_prev) ++ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, ++ skb->dev, pt_prev, orig_dev); ++ return ret; ++} ++ ++/** ++ * netif_receive_skb_core - special purpose version of netif_receive_skb ++ * @skb: buffer to process ++ * ++ * More direct receive version of netif_receive_skb(). It should ++ * only be used by callers that have a need to skip RPS and Generic XDP. ++ * Caller must also take care of handling if ``(page_is_)pfmemalloc``. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb_core(struct sk_buff *skb) ++{ ++ int ret; ++ ++ rcu_read_lock(); ++ ret = __netif_receive_skb_one_core(skb, false); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb_core); ++ ++static inline void __netif_receive_skb_list_ptype(struct list_head *head, ++ struct packet_type *pt_prev, ++ struct net_device *orig_dev) ++{ ++ struct sk_buff *skb, *next; ++ ++ if (!pt_prev) ++ return; ++ if (list_empty(head)) ++ return; ++ if (pt_prev->list_func != NULL) ++ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ++ ip_list_rcv, head, pt_prev, orig_dev); ++ else ++ list_for_each_entry_safe(skb, next, head, list) { ++ skb_list_del_init(skb); ++ pt_prev->func(skb, skb->dev, pt_prev, orig_dev); ++ } ++} ++ ++static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) ++{ ++ /* Fast-path assumptions: ++ * - There is no RX handler. ++ * - Only one packet_type matches. ++ * If either of these fails, we will end up doing some per-packet ++ * processing in-line, then handling the 'last ptype' for the whole ++ * sublist. This can't cause out-of-order delivery to any single ptype, ++ * because the 'last ptype' must be constant across the sublist, and all ++ * other ptypes are handled per-packet. 
++ */ ++ /* Current (common) ptype of sublist */ ++ struct packet_type *pt_curr = NULL; ++ /* Current (common) orig_dev of sublist */ ++ struct net_device *od_curr = NULL; ++ struct list_head sublist; ++ struct sk_buff *skb, *next; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct net_device *orig_dev = skb->dev; ++ struct packet_type *pt_prev = NULL; ++ ++ skb_list_del_init(skb); ++ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); ++ if (!pt_prev) ++ continue; ++ if (pt_curr != pt_prev || od_curr != orig_dev) { ++ /* dispatch old sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++ /* start new sublist */ ++ INIT_LIST_HEAD(&sublist); ++ pt_curr = pt_prev; ++ od_curr = orig_dev; ++ } ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* dispatch final sublist */ ++ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); ++} ++ ++static int __netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { ++ unsigned int noreclaim_flag; ++ ++ /* ++ * PFMEMALLOC skbs are special, they should ++ * - be delivered to SOCK_MEMALLOC sockets only ++ * - stay away from userspace ++ * - have bounded memory usage ++ * ++ * Use PF_MEMALLOC as this saves us from propagating the allocation ++ * context down to all allocation sites. ++ */ ++ noreclaim_flag = memalloc_noreclaim_save(); ++ ret = __netif_receive_skb_one_core(skb, true); ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } else ++ ret = __netif_receive_skb_one_core(skb, false); ++ ++ return ret; ++} ++ ++static void __netif_receive_skb_list(struct list_head *head) ++{ ++ unsigned long noreclaim_flag = 0; ++ struct sk_buff *skb, *next; ++ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ ++ ++ list_for_each_entry_safe(skb, next, head, list) { ++ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { ++ struct list_head sublist; ++ ++ /* Handle the previous sublist */ ++ list_cut_before(&sublist, head, &skb->list); ++ if (!list_empty(&sublist)) ++ __netif_receive_skb_list_core(&sublist, pfmemalloc); ++ pfmemalloc = !pfmemalloc; ++ /* See comments in __netif_receive_skb */ ++ if (pfmemalloc) ++ noreclaim_flag = memalloc_noreclaim_save(); ++ else ++ memalloc_noreclaim_restore(noreclaim_flag); ++ } ++ } ++ /* Handle the remaining sublist */ ++ if (!list_empty(head)) ++ __netif_receive_skb_list_core(head, pfmemalloc); ++ /* Restore pflags */ ++ if (pfmemalloc) ++ memalloc_noreclaim_restore(noreclaim_flag); ++} ++ ++static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) ++{ ++ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); ++ struct bpf_prog *new = xdp->prog; ++ int ret = 0; ++ ++ switch (xdp->command) { ++ case XDP_SETUP_PROG: ++ rcu_assign_pointer(dev->xdp_prog, new); ++ if (old) ++ bpf_prog_put(old); ++ ++ if (old && !new) { ++ static_branch_dec(&generic_xdp_needed_key); ++ } else if (new && !old) { ++ static_branch_inc(&generic_xdp_needed_key); ++ dev_disable_lro(dev); ++ dev_disable_gro_hw(dev); ++ } ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++static int netif_receive_skb_internal(struct sk_buff *skb) ++{ ++ int ret; ++ ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ ++ if (skb_defer_rx_timestamp(skb)) ++ return NET_RX_SUCCESS; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ rcu_read_unlock(); ++ return ret; ++ } ++ } ++#endif ++ ret = __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ return ret; ++} ++ ++void netif_receive_skb_list_internal(struct list_head *head) ++{ ++ struct sk_buff *skb, *next; ++ struct list_head sublist; ++ ++ INIT_LIST_HEAD(&sublist); ++ list_for_each_entry_safe(skb, next, head, list) { ++ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); ++ skb_list_del_init(skb); ++ if (!skb_defer_rx_timestamp(skb)) ++ list_add_tail(&skb->list, &sublist); ++ } ++ list_splice_init(&sublist, head); ++ ++ rcu_read_lock(); ++#ifdef CONFIG_RPS ++ if (static_branch_unlikely(&rps_needed)) { ++ list_for_each_entry_safe(skb, next, head, list) { ++ struct rps_dev_flow voidflow, *rflow = &voidflow; ++ int cpu = get_rps_cpu(skb->dev, skb, &rflow); ++ ++ if (cpu >= 0) { ++ /* Will be handled, remove from list */ ++ skb_list_del_init(skb); ++ enqueue_to_backlog(skb, cpu, &rflow->last_qtail); ++ } ++ } ++ } ++#endif ++ __netif_receive_skb_list(head); ++ rcu_read_unlock(); ++} ++ ++/** ++ * netif_receive_skb - process receive buffer from network ++ * @skb: buffer to process ++ * ++ * netif_receive_skb() is the main receive data processing function. ++ * It always succeeds. The buffer may be dropped during processing ++ * for congestion control or by the protocol layers. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. 
++ * ++ * Return values (usually ignored): ++ * NET_RX_SUCCESS: no congestion ++ * NET_RX_DROP: packet was dropped ++ */ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ int ret; ++ ++ trace_netif_receive_skb_entry(skb); ++ ++ ret = netif_receive_skb_internal(skb); ++ trace_netif_receive_skb_exit(ret); ++ ++ return ret; ++} ++EXPORT_SYMBOL(netif_receive_skb); ++ ++/** ++ * netif_receive_skb_list - process many receive buffers from network ++ * @head: list of skbs to process. ++ * ++ * Since return value of netif_receive_skb() is normally ignored, and ++ * wouldn't be meaningful for a list, this function returns void. ++ * ++ * This function may only be called from softirq context and interrupts ++ * should be enabled. ++ */ ++void netif_receive_skb_list(struct list_head *head) ++{ ++ struct sk_buff *skb; ++ ++ if (list_empty(head)) ++ return; ++ if (trace_netif_receive_skb_list_entry_enabled()) { ++ list_for_each_entry(skb, head, list) ++ trace_netif_receive_skb_list_entry(skb); ++ } ++ netif_receive_skb_list_internal(head); ++ trace_netif_receive_skb_list_exit(0); ++} ++EXPORT_SYMBOL(netif_receive_skb_list); ++ ++static DEFINE_PER_CPU(struct work_struct, flush_works); ++ ++/* Network device is going away, flush any packets still pending */ ++static void flush_backlog(struct work_struct *work) ++{ ++ struct sk_buff *skb, *tmp; ++ struct softnet_data *sd; ++ ++ local_bh_disable(); ++ sd = this_cpu_ptr(&softnet_data); ++ ++ rps_lock_irq_disable(sd); ++ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->input_pkt_queue); ++ dev_kfree_skb_irq(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ rps_unlock_irq_enable(sd); ++ ++ skb_queue_walk_safe(&sd->process_queue, skb, tmp) { ++ if (skb->dev->reg_state == NETREG_UNREGISTERING) { ++ __skb_unlink(skb, &sd->process_queue); ++ kfree_skb(skb); ++ input_queue_head_incr(sd); ++ } ++ } ++ local_bh_enable(); ++} ++ ++static bool flush_required(int cpu) ++{ ++#if IS_ENABLED(CONFIG_RPS) ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ bool do_flush; ++ ++ rps_lock_irq_disable(sd); ++ ++ /* as insertion into process_queue happens with the rps lock held, ++ * process_queue access may race only with dequeue ++ */ ++ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || ++ !skb_queue_empty_lockless(&sd->process_queue); ++ rps_unlock_irq_enable(sd); ++ ++ return do_flush; ++#endif ++ /* without RPS we can't safely check input_pkt_queue: during a ++ * concurrent remote skb_queue_splice() we can detect as empty both ++ * input_pkt_queue and process_queue even if the latter could end-up ++ * containing a lot of packets. 
++ */ ++ return true; ++} ++ ++static void flush_all_backlogs(void) ++{ ++ static cpumask_t flush_cpus; ++ unsigned int cpu; ++ ++ /* since we are under rtnl lock protection we can use static data ++ * for the cpumask and avoid allocating on stack the possibly ++ * large mask ++ */ ++ ASSERT_RTNL(); ++ ++ cpus_read_lock(); ++ ++ cpumask_clear(&flush_cpus); ++ for_each_online_cpu(cpu) { ++ if (flush_required(cpu)) { ++ queue_work_on(cpu, system_highpri_wq, ++ per_cpu_ptr(&flush_works, cpu)); ++ cpumask_set_cpu(cpu, &flush_cpus); ++ } ++ } ++ ++ /* we can have in flight packet[s] on the cpus we are not flushing, ++ * synchronize_net() in unregister_netdevice_many() will take care of ++ * them ++ */ ++ for_each_cpu(cpu, &flush_cpus) ++ flush_work(per_cpu_ptr(&flush_works, cpu)); ++ ++ cpus_read_unlock(); ++} ++ ++static void net_rps_send_ipi(struct softnet_data *remsd) ++{ ++#ifdef CONFIG_RPS ++ while (remsd) { ++ struct softnet_data *next = remsd->rps_ipi_next; ++ ++ if (cpu_online(remsd->cpu)) ++ smp_call_function_single_async(remsd->cpu, &remsd->csd); ++ remsd = next; ++ } ++#endif ++} ++ ++/* ++ * net_rps_action_and_irq_enable sends any pending IPI's for rps. ++ * Note: called with local irq disabled, but exits with local irq enabled. ++ */ ++static void net_rps_action_and_irq_enable(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ struct softnet_data *remsd = sd->rps_ipi_list; ++ ++ if (remsd) { ++ sd->rps_ipi_list = NULL; ++ ++ local_irq_enable(); ++ ++ /* Send pending IPI's to kick RPS processing on remote cpus. */ ++ net_rps_send_ipi(remsd); ++ } else ++#endif ++ local_irq_enable(); ++} ++ ++static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) ++{ ++#ifdef CONFIG_RPS ++ return sd->rps_ipi_list != NULL; ++#else ++ return false; ++#endif ++} ++ ++static int process_backlog(struct napi_struct *napi, int quota) ++{ ++ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); ++ bool again = true; ++ int work = 0; ++ ++ /* Check if we have pending ipi, its better to send them now, ++ * not waiting net_rx_action() end. ++ */ ++ if (sd_has_rps_ipi_waiting(sd)) { ++ local_irq_disable(); ++ net_rps_action_and_irq_enable(sd); ++ } ++ ++ napi->weight = READ_ONCE(dev_rx_weight); ++ while (again) { ++ struct sk_buff *skb; ++ ++ while ((skb = __skb_dequeue(&sd->process_queue))) { ++ rcu_read_lock(); ++ __netif_receive_skb(skb); ++ rcu_read_unlock(); ++ input_queue_head_incr(sd); ++ if (++work >= quota) ++ return work; ++ ++ } ++ ++ rps_lock_irq_disable(sd); ++ if (skb_queue_empty(&sd->input_pkt_queue)) { ++ /* ++ * Inline a custom version of __napi_complete(). ++ * only current cpu owns and manipulates this napi, ++ * and NAPI_STATE_SCHED is the only possible flag set ++ * on backlog. ++ * We can use a plain write instead of clear_bit(), ++ * and we dont need an smp_mb() memory barrier. ++ */ ++ napi->state = 0; ++ again = false; ++ } else { ++ skb_queue_splice_tail_init(&sd->input_pkt_queue, ++ &sd->process_queue); ++ } ++ rps_unlock_irq_enable(sd); ++ } ++ ++ return work; ++} ++ ++/** ++ * __napi_schedule - schedule for receive ++ * @n: entry to schedule ++ * ++ * The entry's receive function will be scheduled to run. ++ * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
++ */ ++void __napi_schedule(struct napi_struct *n) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__napi_schedule); ++ ++/** ++ * napi_schedule_prep - check if napi can be scheduled ++ * @n: napi context ++ * ++ * Test if NAPI routine is already running, and if not mark ++ * it as running. This is used as a condition variable to ++ * insure only one NAPI poll instance runs. We also make ++ * sure there is no pending NAPI disable. ++ */ ++bool napi_schedule_prep(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ if (unlikely(val & NAPIF_STATE_DISABLE)) ++ return false; ++ new = val | NAPIF_STATE_SCHED; ++ ++ /* Sets STATE_MISSED bit if STATE_SCHED was already set ++ * This was suggested by Alexander Duyck, as compiler ++ * emits better code than : ++ * if (val & NAPIF_STATE_SCHED) ++ * new |= NAPIF_STATE_MISSED; ++ */ ++ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * ++ NAPIF_STATE_MISSED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ return !(val & NAPIF_STATE_SCHED); ++} ++EXPORT_SYMBOL(napi_schedule_prep); ++ ++/** ++ * __napi_schedule_irqoff - schedule for receive ++ * @n: entry to schedule ++ * ++ * Variant of __napi_schedule() assuming hard irqs are masked. ++ * ++ * On PREEMPT_RT enabled kernels this maps to __napi_schedule() ++ * because the interrupt disabled assumption might not be true ++ * due to force-threaded interrupts and spinlock substitution. ++ */ ++void __napi_schedule_irqoff(struct napi_struct *n) ++{ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ ____napi_schedule(this_cpu_ptr(&softnet_data), n); ++ else ++ __napi_schedule(n); ++} ++EXPORT_SYMBOL(__napi_schedule_irqoff); ++ ++bool napi_complete_done(struct napi_struct *n, int work_done) ++{ ++ unsigned long flags, val, new, timeout = 0; ++ bool ret = true; ++ ++ /* ++ * 1) Don't let napi dequeue from the cpu poll list ++ * just in case its running on a different cpu. ++ * 2) If we are busy polling, do nothing here, we have ++ * the guarantee we will be called later. ++ */ ++ if (unlikely(n->state & (NAPIF_STATE_NPSVC | ++ NAPIF_STATE_IN_BUSY_POLL))) ++ return false; ++ ++ if (work_done) { ++ if (n->gro_bitmask) ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); ++ } ++ if (n->defer_hard_irqs_count > 0) { ++ n->defer_hard_irqs_count--; ++ timeout = READ_ONCE(n->dev->gro_flush_timeout); ++ if (timeout) ++ ret = false; ++ } ++ if (n->gro_bitmask) { ++ /* When the NAPI instance uses a timeout and keeps postponing ++ * it, we need to bound somehow the time packets are kept in ++ * the GRO layer ++ */ ++ napi_gro_flush(n, !!timeout); ++ } ++ ++ gro_normal_list(n); ++ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ /* If n->poll_list is not empty, we need to mask irqs */ ++ local_irq_save(flags); ++ list_del_init(&n->poll_list); ++ local_irq_restore(flags); ++ } ++ ++ do { ++ val = READ_ONCE(n->state); ++ ++ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); ++ ++ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | ++ NAPIF_STATE_SCHED_THREADED | ++ NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ /* If STATE_MISSED was set, leave STATE_SCHED set, ++ * because we will call napi->poll() one more time. ++ * This C code was suggested by Alexander Duyck to help gcc. 
++ */ ++ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * ++ NAPIF_STATE_SCHED; ++ } while (cmpxchg(&n->state, val, new) != val); ++ ++ if (unlikely(val & NAPIF_STATE_MISSED)) { ++ __napi_schedule(n); ++ return false; ++ } ++ ++ if (timeout) ++ hrtimer_start(&n->timer, ns_to_ktime(timeout), ++ HRTIMER_MODE_REL_PINNED); ++ return ret; ++} ++EXPORT_SYMBOL(napi_complete_done); ++ ++/* must be called under rcu_read_lock(), as we dont take a reference */ ++static struct napi_struct *napi_by_id(unsigned int napi_id) ++{ ++ unsigned int hash = napi_id % HASH_SIZE(napi_hash); ++ struct napi_struct *napi; ++ ++ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) ++ if (napi->napi_id == napi_id) ++ return napi; ++ ++ return NULL; ++} ++ ++#if defined(CONFIG_NET_RX_BUSY_POLL) ++ ++static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) ++{ ++ if (!skip_schedule) { ++ gro_normal_list(napi); ++ __napi_schedule(napi); ++ return; ++ } ++ ++ if (napi->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(napi, HZ >= 1000); ++ } ++ ++ gro_normal_list(napi); ++ clear_bit(NAPI_STATE_SCHED, &napi->state); ++} ++ ++static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, ++ u16 budget) ++{ ++ bool skip_schedule = false; ++ unsigned long timeout; ++ int rc; ++ ++ /* Busy polling means there is a high chance device driver hard irq ++ * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was ++ * set in napi_schedule_prep(). ++ * Since we are about to call napi->poll() once more, we can safely ++ * clear NAPI_STATE_MISSED. ++ * ++ * Note: x86 could use a single "lock and ..." instruction ++ * to perform these two clear_bit() ++ */ ++ clear_bit(NAPI_STATE_MISSED, &napi->state); ++ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); ++ ++ local_bh_disable(); ++ ++ if (prefer_busy_poll) { ++ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); ++ timeout = READ_ONCE(napi->dev->gro_flush_timeout); ++ if (napi->defer_hard_irqs_count && timeout) { ++ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); ++ skip_schedule = true; ++ } ++ } ++ ++ /* All we really want here is to re-enable device interrupts. ++ * Ideally, a new ndo_busy_poll_stop() could avoid another round. ++ */ ++ rc = napi->poll(napi, budget); ++ /* We can't gro_normal_list() here, because napi->poll() might have ++ * rearmed the napi (napi_complete_done()) in which case it could ++ * already be running on another CPU. ++ */ ++ trace_napi_poll(napi, rc, budget); ++ netpoll_poll_unlock(have_poll_lock); ++ if (rc == budget) ++ __busy_poll_stop(napi, skip_schedule); ++ local_bh_enable(); ++} ++ ++void napi_busy_loop(unsigned int napi_id, ++ bool (*loop_end)(void *, unsigned long), ++ void *loop_end_arg, bool prefer_busy_poll, u16 budget) ++{ ++ unsigned long start_time = loop_end ? busy_loop_current_time() : 0; ++ int (*napi_poll)(struct napi_struct *napi, int budget); ++ void *have_poll_lock = NULL; ++ struct napi_struct *napi; ++ ++restart: ++ napi_poll = NULL; ++ ++ rcu_read_lock(); ++ ++ napi = napi_by_id(napi_id); ++ if (!napi) ++ goto out; ++ ++ preempt_disable(); ++ for (;;) { ++ int work = 0; ++ ++ local_bh_disable(); ++ if (!napi_poll) { ++ unsigned long val = READ_ONCE(napi->state); ++ ++ /* If multiple threads are competing for this napi, ++ * we avoid dirtying napi->state as much as we can. 
++ */ ++ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | ++ NAPIF_STATE_IN_BUSY_POLL)) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ if (cmpxchg(&napi->state, val, ++ val | NAPIF_STATE_IN_BUSY_POLL | ++ NAPIF_STATE_SCHED) != val) { ++ if (prefer_busy_poll) ++ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ goto count; ++ } ++ have_poll_lock = netpoll_poll_lock(napi); ++ napi_poll = napi->poll; ++ } ++ work = napi_poll(napi, budget); ++ trace_napi_poll(napi, work, budget); ++ gro_normal_list(napi); ++count: ++ if (work > 0) ++ __NET_ADD_STATS(dev_net(napi->dev), ++ LINUX_MIB_BUSYPOLLRXPACKETS, work); ++ local_bh_enable(); ++ ++ if (!loop_end || loop_end(loop_end_arg, start_time)) ++ break; ++ ++ if (unlikely(need_resched())) { ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++ rcu_read_unlock(); ++ cond_resched(); ++ if (loop_end(loop_end_arg, start_time)) ++ return; ++ goto restart; ++ } ++ cpu_relax(); ++ } ++ if (napi_poll) ++ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); ++ preempt_enable(); ++out: ++ rcu_read_unlock(); ++} ++EXPORT_SYMBOL(napi_busy_loop); ++ ++#endif /* CONFIG_NET_RX_BUSY_POLL */ ++ ++static void napi_hash_add(struct napi_struct *napi) ++{ ++ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) ++ return; ++ ++ spin_lock(&napi_hash_lock); ++ ++ /* 0..NR_CPUS range is reserved for sender_cpu use */ ++ do { ++ if (unlikely(++napi_gen_id < MIN_NAPI_ID)) ++ napi_gen_id = MIN_NAPI_ID; ++ } while (napi_by_id(napi_gen_id)); ++ napi->napi_id = napi_gen_id; ++ ++ hlist_add_head_rcu(&napi->napi_hash_node, ++ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++/* Warning : caller is responsible to make sure rcu grace period ++ * is respected before freeing memory containing @napi ++ */ ++static void napi_hash_del(struct napi_struct *napi) ++{ ++ spin_lock(&napi_hash_lock); ++ ++ hlist_del_init_rcu(&napi->napi_hash_node); ++ ++ spin_unlock(&napi_hash_lock); ++} ++ ++static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) ++{ ++ struct napi_struct *napi; ++ ++ napi = container_of(timer, struct napi_struct, timer); ++ ++ /* Note : we use a relaxed variant of napi_schedule_prep() not setting ++ * NAPI_STATE_MISSED, since we do not react to a device IRQ. ++ */ ++ if (!napi_disable_pending(napi) && ++ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { ++ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); ++ __napi_schedule_irqoff(napi); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void init_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ INIT_LIST_HEAD(&napi->gro_hash[i].list); ++ napi->gro_hash[i].count = 0; ++ } ++ napi->gro_bitmask = 0; ++} ++ ++int dev_set_threaded(struct net_device *dev, bool threaded) ++{ ++ struct napi_struct *napi; ++ int err = 0; ++ ++ if (dev->threaded == threaded) ++ return 0; ++ ++ if (threaded) { ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (!napi->thread) { ++ err = napi_kthread_create(napi); ++ if (err) { ++ threaded = false; ++ break; ++ } ++ } ++ } ++ } ++ ++ dev->threaded = threaded; ++ ++ /* Make sure kthread is created before THREADED bit ++ * is set. ++ */ ++ smp_mb__before_atomic(); ++ ++ /* Setting/unsetting threaded mode on a napi might not immediately ++ * take effect, if the current napi instance is actively being ++ * polled. 
In this case, the switch between threaded mode and ++ * softirq mode will happen in the next round of napi_schedule(). ++ * This should not cause hiccups/stalls to the live traffic. ++ */ ++ list_for_each_entry(napi, &dev->napi_list, dev_list) { ++ if (threaded) ++ set_bit(NAPI_STATE_THREADED, &napi->state); ++ else ++ clear_bit(NAPI_STATE_THREADED, &napi->state); ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_set_threaded); ++ ++/* Double check that napi_get_frags() allocates skbs with ++ * skb->head being backed by slab, not a page fragment. ++ * This is to make sure bug fixed in 3226b158e67c ++ * ("net: avoid 32 x truesize under-estimation for tiny skbs") ++ * does not accidentally come back. ++ */ ++static void napi_get_frags_check(struct napi_struct *napi) ++{ ++ struct sk_buff *skb; ++ ++ local_bh_disable(); ++ skb = napi_get_frags(napi); ++ WARN_ON_ONCE(skb && skb->head_frag); ++ napi_free_frags(napi); ++ local_bh_enable(); ++} ++ ++void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, ++ int (*poll)(struct napi_struct *, int), int weight) ++{ ++ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) ++ return; ++ ++ INIT_LIST_HEAD(&napi->poll_list); ++ INIT_HLIST_NODE(&napi->napi_hash_node); ++ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); ++ napi->timer.function = napi_watchdog; ++ init_gro_hash(napi); ++ napi->skb = NULL; ++ INIT_LIST_HEAD(&napi->rx_list); ++ napi->rx_count = 0; ++ napi->poll = poll; ++ if (weight > NAPI_POLL_WEIGHT) ++ netdev_err_once(dev, "%s() called with weight %d\n", __func__, ++ weight); ++ napi->weight = weight; ++ napi->dev = dev; ++#ifdef CONFIG_NETPOLL ++ napi->poll_owner = -1; ++#endif ++ set_bit(NAPI_STATE_SCHED, &napi->state); ++ set_bit(NAPI_STATE_NPSVC, &napi->state); ++ list_add_rcu(&napi->dev_list, &dev->napi_list); ++ napi_hash_add(napi); ++ napi_get_frags_check(napi); ++ /* Create kthread for this napi if dev->threaded is set. ++ * Clear dev->threaded if kthread creation failed so that ++ * threaded mode will not be enabled in napi_enable(). ++ */ ++ if (dev->threaded && napi_kthread_create(napi)) ++ dev->threaded = 0; ++} ++EXPORT_SYMBOL(netif_napi_add_weight); ++ ++void napi_disable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ might_sleep(); ++ set_bit(NAPI_STATE_DISABLE, &n->state); ++ ++ for ( ; ; ) { ++ val = READ_ONCE(n->state); ++ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { ++ usleep_range(20, 200); ++ continue; ++ } ++ ++ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; ++ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); ++ ++ if (cmpxchg(&n->state, val, new) == val) ++ break; ++ } ++ ++ hrtimer_cancel(&n->timer); ++ ++ clear_bit(NAPI_STATE_DISABLE, &n->state); ++} ++EXPORT_SYMBOL(napi_disable); ++ ++/** ++ * napi_enable - enable NAPI scheduling ++ * @n: NAPI context ++ * ++ * Resume NAPI from being scheduled on this context. ++ * Must be paired with napi_disable. 
++ */ ++void napi_enable(struct napi_struct *n) ++{ ++ unsigned long val, new; ++ ++ do { ++ val = READ_ONCE(n->state); ++ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); ++ ++ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); ++ if (n->dev->threaded && n->thread) ++ new |= NAPIF_STATE_THREADED; ++ } while (cmpxchg(&n->state, val, new) != val); ++} ++EXPORT_SYMBOL(napi_enable); ++ ++static void flush_gro_hash(struct napi_struct *napi) ++{ ++ int i; ++ ++ for (i = 0; i < GRO_HASH_BUCKETS; i++) { ++ struct sk_buff *skb, *n; ++ ++ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) ++ kfree_skb(skb); ++ napi->gro_hash[i].count = 0; ++ } ++} ++ ++/* Must be called in process context */ ++void __netif_napi_del(struct napi_struct *napi) ++{ ++ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) ++ return; ++ ++ napi_hash_del(napi); ++ list_del_rcu(&napi->dev_list); ++ napi_free_frags(napi); ++ ++ flush_gro_hash(napi); ++ napi->gro_bitmask = 0; ++ ++ if (napi->thread) { ++ kthread_stop(napi->thread); ++ napi->thread = NULL; ++ } ++} ++EXPORT_SYMBOL(__netif_napi_del); ++ ++static int __napi_poll(struct napi_struct *n, bool *repoll) ++{ ++ int work, weight; ++ ++ weight = n->weight; ++ ++ /* This NAPI_STATE_SCHED test is for avoiding a race ++ * with netpoll's poll_napi(). Only the entity which ++ * obtains the lock and sees NAPI_STATE_SCHED set will ++ * actually make the ->poll() call. Therefore we avoid ++ * accidentally calling ->poll() when NAPI is not scheduled. ++ */ ++ work = 0; ++ if (test_bit(NAPI_STATE_SCHED, &n->state)) { ++ work = n->poll(n, weight); ++ trace_napi_poll(n, work, weight); ++ } ++ ++ if (unlikely(work > weight)) ++ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", ++ n->poll, work, weight); ++ ++ if (likely(work < weight)) ++ return work; ++ ++ /* Drivers must not modify the NAPI state if they ++ * consume the entire weight. In such cases this code ++ * still "owns" the NAPI instance and therefore can ++ * move the instance around on the list at-will. ++ */ ++ if (unlikely(napi_disable_pending(n))) { ++ napi_complete(n); ++ return work; ++ } ++ ++ /* The NAPI context has more processing work, but busy-polling ++ * is preferred. Exit early. ++ */ ++ if (napi_prefer_busy_poll(n)) { ++ if (napi_complete_done(n, work)) { ++ /* If timeout is not set, we need to make sure ++ * that the NAPI is re-scheduled. ++ */ ++ napi_schedule(n); ++ } ++ return work; ++ } ++ ++ if (n->gro_bitmask) { ++ /* flush too old packets ++ * If HZ < 1000, flush all packets. ++ */ ++ napi_gro_flush(n, HZ >= 1000); ++ } ++ ++ gro_normal_list(n); ++ ++ /* Some drivers may have called napi_schedule ++ * prior to exhausting their budget. ++ */ ++ if (unlikely(!list_empty(&n->poll_list))) { ++ pr_warn_once("%s: Budget exhausted after napi rescheduled\n", ++ n->dev ? 
n->dev->name : "backlog"); ++ return work; ++ } ++ ++ *repoll = true; ++ ++ return work; ++} ++ ++static int napi_poll(struct napi_struct *n, struct list_head *repoll) ++{ ++ bool do_repoll = false; ++ void *have; ++ int work; ++ ++ list_del_init(&n->poll_list); ++ ++ have = netpoll_poll_lock(n); ++ ++ work = __napi_poll(n, &do_repoll); ++ ++ if (do_repoll) ++ list_add_tail(&n->poll_list, repoll); ++ ++ netpoll_poll_unlock(have); ++ ++ return work; ++} ++ ++static int napi_thread_wait(struct napi_struct *napi) ++{ ++ bool woken = false; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ /* Testing SCHED_THREADED bit here to make sure the current ++ * kthread owns this napi and could poll on this napi. ++ * Testing SCHED bit is not enough because SCHED bit might be ++ * set by some other busy poll thread or by napi_disable(). ++ */ ++ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { ++ WARN_ON(!list_empty(&napi->poll_list)); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ schedule(); ++ /* woken being true indicates this thread owns this napi. */ ++ woken = true; ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ return -1; ++} ++ ++static int napi_threaded_poll(void *data) ++{ ++ struct napi_struct *napi = data; ++ void *have; ++ ++ while (!napi_thread_wait(napi)) { ++ for (;;) { ++ bool repoll = false; ++ ++ local_bh_disable(); ++ ++ have = netpoll_poll_lock(napi); ++ __napi_poll(napi, &repoll); ++ netpoll_poll_unlock(have); ++ ++ local_bh_enable(); ++ ++ if (!repoll) ++ break; ++ ++ cond_resched(); ++ } ++ } ++ return 0; ++} ++ ++static void skb_defer_free_flush(struct softnet_data *sd) ++{ ++ struct sk_buff *skb, *next; ++ unsigned long flags; ++ ++ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ ++ if (!READ_ONCE(sd->defer_list)) ++ return; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ skb = sd->defer_list; ++ sd->defer_list = NULL; ++ sd->defer_count = 0; ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ while (skb != NULL) { ++ next = skb->next; ++ napi_consume_skb(skb, 1); ++ skb = next; ++ } ++} ++ ++static __latent_entropy void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = this_cpu_ptr(&softnet_data); ++ unsigned long time_limit = jiffies + ++ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); ++ int budget = READ_ONCE(netdev_budget); ++ LIST_HEAD(list); ++ LIST_HEAD(repoll); ++ ++ local_irq_disable(); ++ list_splice_init(&sd->poll_list, &list); ++ local_irq_enable(); ++ ++ for (;;) { ++ struct napi_struct *n; ++ ++ skb_defer_free_flush(sd); ++ ++ if (list_empty(&list)) { ++ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) ++ goto end; ++ break; ++ } ++ ++ n = list_first_entry(&list, struct napi_struct, poll_list); ++ budget -= napi_poll(n, &repoll); ++ ++ /* If softirq window is exhausted then punt. ++ * Allow this to run for 2 jiffies since which will allow ++ * an average latency of 1.5/HZ. 
++ */ ++ if (unlikely(budget <= 0 || ++ time_after_eq(jiffies, time_limit))) { ++ sd->time_squeeze++; ++ break; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ list_splice_tail_init(&sd->poll_list, &list); ++ list_splice_tail(&repoll, &list); ++ list_splice(&list, &sd->poll_list); ++ if (!list_empty(&sd->poll_list)) ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ ++ net_rps_action_and_irq_enable(sd); ++end:; ++} ++ ++struct netdev_adjacent { ++ struct net_device *dev; ++ netdevice_tracker dev_tracker; ++ ++ /* upper master flag, there can only be one master device per list */ ++ bool master; ++ ++ /* lookup ignore flag */ ++ bool ignore; ++ ++ /* counter for the number of times this device was added to us */ ++ u16 ref_nr; ++ ++ /* private field for the users */ ++ void *private; ++ ++ struct list_head list; ++ struct rcu_head rcu; ++}; ++ ++static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, ++ struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ list_for_each_entry(adj, adj_list, list) { ++ if (adj->dev == adj_dev) ++ return adj; ++ } ++ return NULL; ++} ++ ++static int ____netdev_has_upper_dev(struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *dev = (struct net_device *)priv->data; ++ ++ return upper_dev == dev; ++} ++ ++/** ++ * netdev_has_upper_dev - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks only immediate upper device, ++ * not through a complete stack of devices. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev); ++ ++/** ++ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device ++ * @dev: device ++ * @upper_dev: upper device to check ++ * ++ * Find out if a device is linked to specified upper device and return true ++ * in case it is. Note that this checks the entire upper device chain. ++ * The caller must hold rcu lock. ++ */ ++ ++bool netdev_has_upper_dev_all_rcu(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .data = (void *)upper_dev, ++ }; ++ ++ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); ++ ++/** ++ * netdev_has_any_upper_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to an upper device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++bool netdev_has_any_upper_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.upper); ++} ++EXPORT_SYMBOL(netdev_has_any_upper_dev); ++ ++/** ++ * netdev_master_upper_dev_get - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RTNL lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get); ++ ++static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ ASSERT_RTNL(); ++ ++ if (list_empty(&dev->adj_list.upper)) ++ return NULL; ++ ++ upper = list_first_entry(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (likely(upper->master) && !upper->ignore) ++ return upper->dev; ++ return NULL; ++} ++ ++/** ++ * netdev_has_any_lower_dev - Check if device is linked to some device ++ * @dev: device ++ * ++ * Find out if a device is linked to a lower device and return true in case ++ * it is. The caller must hold the RTNL lock. ++ */ ++static bool netdev_has_any_lower_dev(struct net_device *dev) ++{ ++ ASSERT_RTNL(); ++ ++ return !list_empty(&dev->adj_list.lower); ++} ++ ++void *netdev_adjacent_get_private(struct list_head *adj_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = list_entry(adj_list, struct netdev_adjacent, list); ++ ++ return adj->private; ++} ++EXPORT_SYMBOL(netdev_adjacent_get_private); ++ ++/** ++ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next device from the dev's upper list, starting from iter ++ * position. The caller must hold RCU read lock. ++ */ ++struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); ++ ++static struct net_device *__netdev_next_upper_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ *ignore = upper->ignore; ++ ++ return upper->dev; ++} ++ ++static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *upper; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); ++ ++ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&upper->list == &dev->adj_list.upper) ++ return NULL; ++ ++ *iter = &upper->list; ++ ++ return upper->dev; ++} ++ ++static int __netdev_walk_all_upper_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = __netdev_next_upper_dev(now, &iter, &ignore); ++ if (!udev) ++ break; ++ if 
(ignore) ++ continue; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++int netdev_walk_all_upper_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.upper; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ udev = netdev_next_upper_dev_rcu(now, &iter); ++ if (!udev) ++ break; ++ ++ next = udev; ++ niter = &udev->adj_list.upper; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); ++ ++static bool __netdev_has_upper_dev(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = (void *)upper_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, ++ &priv); ++} ++ ++/** ++ * netdev_lower_get_next_private - Get the next ->private from the ++ * lower neighbour list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold either hold the ++ * RTNL lock or its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. ++ */ ++void *netdev_lower_get_next_private(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private); ++ ++/** ++ * netdev_lower_get_next_private_rcu - Get the next ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent->private from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_next_private_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); ++ ++/** ++ * netdev_lower_get_next - Get the next device from the lower neighbour ++ * list ++ * @dev: device ++ * @iter: list_head ** of the current position ++ * ++ * Gets the next netdev_adjacent from the dev's lower neighbour ++ * list, starting from iter position. The caller must hold RTNL lock or ++ * its own locking that guarantees that the neighbour lower ++ * list will remain unchanged. 
++ */ ++void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry(*iter, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = lower->list.next; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_lower_get_next); ++ ++static struct net_device *netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++ ++static struct net_device *__netdev_next_lower_dev(struct net_device *dev, ++ struct list_head **iter, ++ bool *ignore) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry((*iter)->next, struct netdev_adjacent, list); ++ ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ *ignore = lower->ignore; ++ ++ return lower->dev; ++} ++ ++int netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); ++ ++static int __netdev_walk_all_lower_dev(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ bool ignore; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = __netdev_next_lower_dev(now, &iter, &ignore); ++ if (!ldev) ++ break; ++ if (ignore) ++ continue; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++ ++struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, ++ struct list_head **iter) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); ++ if (&lower->list == &dev->adj_list.lower) ++ return NULL; ++ ++ *iter = &lower->list; ++ ++ return lower->dev; ++} ++EXPORT_SYMBOL(netdev_next_lower_dev_rcu); ++ ++static u8 __netdev_upper_depth(struct net_device *dev) ++{ ++ struct net_device *udev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.upper, ++ udev = __netdev_next_upper_dev(dev, &iter, &ignore); ++ udev; ++ 
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < udev->upper_level) ++ max_depth = udev->upper_level; ++ } ++ ++ return max_depth; ++} ++ ++static u8 __netdev_lower_depth(struct net_device *dev) ++{ ++ struct net_device *ldev; ++ struct list_head *iter; ++ u8 max_depth = 0; ++ bool ignore; ++ ++ for (iter = &dev->adj_list.lower, ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ++ ldev; ++ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { ++ if (ignore) ++ continue; ++ if (max_depth < ldev->lower_level) ++ max_depth = ldev->lower_level; ++ } ++ ++ return max_depth; ++} ++ ++static int __netdev_update_upper_level(struct net_device *dev, ++ struct netdev_nested_priv *__unused) ++{ ++ dev->upper_level = __netdev_upper_depth(dev) + 1; ++ return 0; ++} ++ ++#ifdef CONFIG_LOCKDEP ++static LIST_HEAD(net_unlink_list); ++ ++static void net_unlink_todo(struct net_device *dev) ++{ ++ if (list_empty(&dev->unlink_list)) ++ list_add_tail(&dev->unlink_list, &net_unlink_list); ++} ++#endif ++ ++static int __netdev_update_lower_level(struct net_device *dev, ++ struct netdev_nested_priv *priv) ++{ ++ dev->lower_level = __netdev_lower_depth(dev) + 1; ++ ++#ifdef CONFIG_LOCKDEP ++ if (!priv) ++ return 0; ++ ++ if (priv->flags & NESTED_SYNC_IMM) ++ dev->nested_level = dev->lower_level - 1; ++ if (priv->flags & NESTED_SYNC_TODO) ++ net_unlink_todo(dev); ++#endif ++ return 0; ++} ++ ++int netdev_walk_all_lower_dev_rcu(struct net_device *dev, ++ int (*fn)(struct net_device *dev, ++ struct netdev_nested_priv *priv), ++ struct netdev_nested_priv *priv) ++{ ++ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; ++ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; ++ int ret, cur = 0; ++ ++ now = dev; ++ iter = &dev->adj_list.lower; ++ ++ while (1) { ++ if (now != dev) { ++ ret = fn(now, priv); ++ if (ret) ++ return ret; ++ } ++ ++ next = NULL; ++ while (1) { ++ ldev = netdev_next_lower_dev_rcu(now, &iter); ++ if (!ldev) ++ break; ++ ++ next = ldev; ++ niter = &ldev->adj_list.lower; ++ dev_stack[cur] = now; ++ iter_stack[cur++] = iter; ++ break; ++ } ++ ++ if (!next) { ++ if (!cur) ++ return 0; ++ next = dev_stack[--cur]; ++ niter = iter_stack[cur]; ++ } ++ ++ now = next; ++ iter = niter; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); ++ ++/** ++ * netdev_lower_get_first_private_rcu - Get the first ->private from the ++ * lower neighbour list, RCU ++ * variant ++ * @dev: device ++ * ++ * Gets the first netdev_adjacent->private from the dev's lower neighbour ++ * list. The caller must hold RCU read lock. ++ */ ++void *netdev_lower_get_first_private_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ lower = list_first_or_null_rcu(&dev->adj_list.lower, ++ struct netdev_adjacent, list); ++ if (lower) ++ return lower->private; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); ++ ++/** ++ * netdev_master_upper_dev_get_rcu - Get master upper device ++ * @dev: device ++ * ++ * Find a master upper device and return pointer to it or NULL in case ++ * it's not there. The caller must hold the RCU read lock. 
++ */ ++struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) ++{ ++ struct netdev_adjacent *upper; ++ ++ upper = list_first_or_null_rcu(&dev->adj_list.upper, ++ struct netdev_adjacent, list); ++ if (upper && likely(upper->master)) ++ return upper->dev; ++ return NULL; ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); ++ ++static int netdev_adjacent_sysfs_add(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", adj_dev->name); ++ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), ++ linkname); ++} ++static void netdev_adjacent_sysfs_del(struct net_device *dev, ++ char *name, ++ struct list_head *dev_list) ++{ ++ char linkname[IFNAMSIZ+7]; ++ ++ sprintf(linkname, dev_list == &dev->adj_list.upper ? ++ "upper_%s" : "lower_%s", name); ++ sysfs_remove_link(&(dev->dev.kobj), linkname); ++} ++ ++static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list) ++{ ++ return (dev_list == &dev->adj_list.upper || ++ dev_list == &dev->adj_list.lower) && ++ net_eq(dev_net(dev), dev_net(adj_dev)); ++} ++ ++static int __netdev_adjacent_dev_insert(struct net_device *dev, ++ struct net_device *adj_dev, ++ struct list_head *dev_list, ++ void *private, bool master) ++{ ++ struct netdev_adjacent *adj; ++ int ret; ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (adj) { ++ adj->ref_nr += 1; ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", ++ dev->name, adj_dev->name, adj->ref_nr); ++ ++ return 0; ++ } ++ ++ adj = kmalloc(sizeof(*adj), GFP_KERNEL); ++ if (!adj) ++ return -ENOMEM; ++ ++ adj->dev = adj_dev; ++ adj->master = master; ++ adj->ref_nr = 1; ++ adj->private = private; ++ adj->ignore = false; ++ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); ++ ++ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", ++ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ++ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); ++ if (ret) ++ goto free_adj; ++ } ++ ++ /* Ensure that master link is always the first item in list. 
*/ ++ if (master) { ++ ret = sysfs_create_link(&(dev->dev.kobj), ++ &(adj_dev->dev.kobj), "master"); ++ if (ret) ++ goto remove_symlinks; ++ ++ list_add_rcu(&adj->list, dev_list); ++ } else { ++ list_add_tail_rcu(&adj->list, dev_list); ++ } ++ ++ return 0; ++ ++remove_symlinks: ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++free_adj: ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree(adj); ++ ++ return ret; ++} ++ ++static void __netdev_adjacent_dev_remove(struct net_device *dev, ++ struct net_device *adj_dev, ++ u16 ref_nr, ++ struct list_head *dev_list) ++{ ++ struct netdev_adjacent *adj; ++ ++ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", ++ dev->name, adj_dev->name, ref_nr); ++ ++ adj = __netdev_find_adj(adj_dev, dev_list); ++ ++ if (!adj) { ++ pr_err("Adjacency does not exist for device %s from %s\n", ++ dev->name, adj_dev->name); ++ WARN_ON(1); ++ return; ++ } ++ ++ if (adj->ref_nr > ref_nr) { ++ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", ++ dev->name, adj_dev->name, ref_nr, ++ adj->ref_nr - ref_nr); ++ adj->ref_nr -= ref_nr; ++ return; ++ } ++ ++ if (adj->master) ++ sysfs_remove_link(&(dev->dev.kobj), "master"); ++ ++ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) ++ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); ++ ++ list_del_rcu(&adj->list); ++ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", ++ adj_dev->name, dev->name, adj_dev->name); ++ netdev_put(adj_dev, &adj->dev_tracker); ++ kfree_rcu(adj, rcu); ++} ++ ++static int __netdev_adjacent_dev_link_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct list_head *up_list, ++ struct list_head *down_list, ++ void *private, bool master) ++{ ++ int ret; ++ ++ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, ++ private, master); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, ++ private, false); ++ if (ret) { ++ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, ++ struct net_device *upper_dev, ++ u16 ref_nr, ++ struct list_head *up_list, ++ struct list_head *down_list) ++{ ++ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); ++ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); ++} ++ ++static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *private, bool master) ++{ ++ return __netdev_adjacent_dev_link_lists(dev, upper_dev, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower, ++ private, master); ++} ++ ++static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, ++ &dev->adj_list.upper, ++ &upper_dev->adj_list.lower); ++} ++ ++static int __netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, bool master, ++ void *upper_priv, void *upper_info, ++ struct netdev_nested_priv *priv, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ .extack = extack, ++ }, ++ .upper_dev = upper_dev, ++ .master = master, ++ .linking = true, ++ .upper_info = upper_info, ++ }; ++ struct net_device *master_dev; ++ int ret = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev == upper_dev) ++ return -EBUSY; ++ ++ /* To prevent 
loops, check if dev is not upper device to upper_dev. */ ++ if (__netdev_has_upper_dev(upper_dev, dev)) ++ return -EBUSY; ++ ++ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) ++ return -EMLINK; ++ ++ if (!master) { ++ if (__netdev_has_upper_dev(dev, upper_dev)) ++ return -EEXIST; ++ } else { ++ master_dev = __netdev_master_upper_dev_get(dev); ++ if (master_dev) ++ return master_dev == upper_dev ? -EEXIST : -EBUSY; ++ } ++ ++ ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ return ret; ++ ++ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, ++ master); ++ if (ret) ++ return ret; ++ ++ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto rollback; ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++ ++ return 0; ++ ++rollback: ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ return ret; ++} ++ ++/** ++ * netdev_upper_dev_link - Add a link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. On a failure a negative errno code is returned. ++ * On success the reference counts are adjusted and the function ++ * returns zero. ++ */ ++int netdev_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, false, ++ NULL, NULL, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_upper_dev_link); ++ ++/** ++ * netdev_master_upper_dev_link - Add a master link to the upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * @upper_priv: upper device private ++ * @upper_info: upper info to be passed down via notifier ++ * @extack: netlink extended ack ++ * ++ * Adds a link to device which is upper to this one. In this case, only ++ * one master upper device can be linked, although other non-master devices ++ * might be linked as well. The caller must hold the RTNL lock. ++ * On a failure a negative errno code is returned. On success the reference ++ * counts are adjusted and the function returns zero. 
++ */ ++int netdev_master_upper_dev_link(struct net_device *dev, ++ struct net_device *upper_dev, ++ void *upper_priv, void *upper_info, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ return __netdev_upper_dev_link(dev, upper_dev, true, ++ upper_priv, upper_info, &priv, extack); ++} ++EXPORT_SYMBOL(netdev_master_upper_dev_link); ++ ++static void __netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev, ++ struct netdev_nested_priv *priv) ++{ ++ struct netdev_notifier_changeupper_info changeupper_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .upper_dev = upper_dev, ++ .linking = false, ++ }; ++ ++ ASSERT_RTNL(); ++ ++ changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; ++ ++ call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, ++ &changeupper_info.info); ++ ++ __netdev_update_upper_level(dev, NULL); ++ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); ++ ++ __netdev_update_lower_level(upper_dev, priv); ++ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, ++ priv); ++} ++ ++/** ++ * netdev_upper_dev_unlink - Removes a link to upper device ++ * @dev: device ++ * @upper_dev: new upper device ++ * ++ * Removes a link to device which is upper to this one. The caller must hold ++ * the RTNL lock. ++ */ ++void netdev_upper_dev_unlink(struct net_device *dev, ++ struct net_device *upper_dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ __netdev_upper_dev_unlink(dev, upper_dev, &priv); ++} ++EXPORT_SYMBOL(netdev_upper_dev_unlink); ++ ++static void __netdev_adjacent_dev_set(struct net_device *upper_dev, ++ struct net_device *lower_dev, ++ bool val) ++{ ++ struct netdev_adjacent *adj; ++ ++ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); ++ if (adj) ++ adj->ignore = val; ++ ++ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); ++ if (adj) ++ adj->ignore = val; ++} ++ ++static void netdev_adjacent_dev_disable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, true); ++} ++ ++static void netdev_adjacent_dev_enable(struct net_device *upper_dev, ++ struct net_device *lower_dev) ++{ ++ __netdev_adjacent_dev_set(upper_dev, lower_dev, false); ++} ++ ++int netdev_adjacent_change_prepare(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ int err; ++ ++ if (!new_dev) ++ return 0; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_disable(dev, old_dev); ++ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, ++ extack); ++ if (err) { ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(netdev_adjacent_change_prepare); ++ ++void netdev_adjacent_change_commit(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, ++ .data = NULL, ++ }; ++ ++ if (!new_dev || !old_dev) ++ return; ++ ++ if (new_dev == old_dev) ++ return; ++ ++ 
netdev_adjacent_dev_enable(dev, old_dev); ++ __netdev_upper_dev_unlink(old_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_commit); ++ ++void netdev_adjacent_change_abort(struct net_device *old_dev, ++ struct net_device *new_dev, ++ struct net_device *dev) ++{ ++ struct netdev_nested_priv priv = { ++ .flags = 0, ++ .data = NULL, ++ }; ++ ++ if (!new_dev) ++ return; ++ ++ if (old_dev && new_dev != old_dev) ++ netdev_adjacent_dev_enable(dev, old_dev); ++ ++ __netdev_upper_dev_unlink(new_dev, dev, &priv); ++} ++EXPORT_SYMBOL(netdev_adjacent_change_abort); ++ ++/** ++ * netdev_bonding_info_change - Dispatch event about slave change ++ * @dev: device ++ * @bonding_info: info to dispatch ++ * ++ * Send NETDEV_BONDING_INFO to netdev notifiers with info. ++ * The caller must hold the RTNL lock. ++ */ ++void netdev_bonding_info_change(struct net_device *dev, ++ struct netdev_bonding_info *bonding_info) ++{ ++ struct netdev_notifier_bonding_info info = { ++ .info.dev = dev, ++ }; ++ ++ memcpy(&info.bonding_info, bonding_info, ++ sizeof(struct netdev_bonding_info)); ++ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, ++ &info.info); ++} ++EXPORT_SYMBOL(netdev_bonding_info_change); ++ ++static int netdev_offload_xstats_enable_l3(struct net_device *dev, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ int err; ++ int rc; ++ ++ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), ++ GFP_KERNEL); ++ if (!dev->offload_xstats_l3) ++ return -ENOMEM; ++ ++ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, ++ NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ err = notifier_to_errno(rc); ++ if (err) ++ goto free_stats; ++ ++ return 0; ++ ++free_stats: ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++ return err; ++} ++ ++int netdev_offload_xstats_enable(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return netdev_offload_xstats_enable_l3(dev, extack); ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enable); ++ ++static void netdev_offload_xstats_disable_l3(struct net_device *dev) ++{ ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, ++ &info.info); ++ kfree(dev->offload_xstats_l3); ++ dev->offload_xstats_l3 = NULL; ++} ++ ++int netdev_offload_xstats_disable(struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ if (!netdev_offload_xstats_enabled(dev, type)) ++ return -EALREADY; ++ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ netdev_offload_xstats_disable_l3(dev); ++ return 0; ++ } ++ ++ WARN_ON(1); ++ return -EINVAL; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_disable); ++ ++static void netdev_offload_xstats_disable_all(struct net_device *dev) ++{ ++ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); ++} ++ ++static struct rtnl_hw_stats64 * ++netdev_offload_xstats_get_ptr(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ switch (type) { ++ case NETDEV_OFFLOAD_XSTATS_TYPE_L3: ++ return dev->offload_xstats_l3; ++ } ++ ++ 
WARN_ON(1); ++ return NULL; ++} ++ ++bool netdev_offload_xstats_enabled(const struct net_device *dev, ++ enum netdev_offload_xstats_type type) ++{ ++ ASSERT_RTNL(); ++ ++ return netdev_offload_xstats_get_ptr(dev, type); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_enabled); ++ ++struct netdev_notifier_offload_xstats_ru { ++ bool used; ++}; ++ ++struct netdev_notifier_offload_xstats_rd { ++ struct rtnl_hw_stats64 stats; ++ bool used; ++}; ++ ++static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, ++ const struct rtnl_hw_stats64 *src) ++{ ++ dest->rx_packets += src->rx_packets; ++ dest->tx_packets += src->tx_packets; ++ dest->rx_bytes += src->rx_bytes; ++ dest->tx_bytes += src->tx_bytes; ++ dest->rx_errors += src->rx_errors; ++ dest->tx_errors += src->tx_errors; ++ dest->rx_dropped += src->rx_dropped; ++ dest->tx_dropped += src->tx_dropped; ++ dest->multicast += src->multicast; ++} ++ ++static int netdev_offload_xstats_get_used(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_ru report_used = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_used = &report_used, ++ }; ++ int rc; ++ ++ WARN_ON(!netdev_offload_xstats_enabled(dev, type)); ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, ++ &info.info); ++ *p_used = report_used.used; ++ return notifier_to_errno(rc); ++} ++ ++static int netdev_offload_xstats_get_stats(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, ++ bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_offload_xstats_rd report_delta = {}; ++ struct netdev_notifier_offload_xstats_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .type = type, ++ .report_delta = &report_delta, ++ }; ++ struct rtnl_hw_stats64 *stats; ++ int rc; ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return -EINVAL; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, ++ &info.info); ++ ++ /* Cache whatever we got, even if there was an error, otherwise the ++ * successful stats retrievals would get lost. 
++ */ ++ netdev_hw_stats64_add(stats, &report_delta.stats); ++ ++ if (p_stats) ++ *p_stats = *stats; ++ *p_used = report_delta.used; ++ ++ return notifier_to_errno(rc); ++} ++ ++int netdev_offload_xstats_get(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ struct rtnl_hw_stats64 *p_stats, bool *p_used, ++ struct netlink_ext_ack *extack) ++{ ++ ASSERT_RTNL(); ++ ++ if (p_stats) ++ return netdev_offload_xstats_get_stats(dev, type, p_stats, ++ p_used, extack); ++ else ++ return netdev_offload_xstats_get_used(dev, type, p_used, ++ extack); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_get); ++ ++void ++netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, ++ const struct rtnl_hw_stats64 *stats) ++{ ++ report_delta->used = true; ++ netdev_hw_stats64_add(&report_delta->stats, stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_delta); ++ ++void ++netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) ++{ ++ report_used->used = true; ++} ++EXPORT_SYMBOL(netdev_offload_xstats_report_used); ++ ++void netdev_offload_xstats_push_delta(struct net_device *dev, ++ enum netdev_offload_xstats_type type, ++ const struct rtnl_hw_stats64 *p_stats) ++{ ++ struct rtnl_hw_stats64 *stats; ++ ++ ASSERT_RTNL(); ++ ++ stats = netdev_offload_xstats_get_ptr(dev, type); ++ if (WARN_ON(!stats)) ++ return; ++ ++ netdev_hw_stats64_add(stats, p_stats); ++} ++EXPORT_SYMBOL(netdev_offload_xstats_push_delta); ++ ++/** ++ * netdev_get_xmit_slave - Get the xmit slave of master device ++ * @dev: device ++ * @skb: The packet ++ * @all_slaves: assume all the slaves are active ++ * ++ * The reference counters are not incremented so the caller must be ++ * careful with locks. The caller must hold RCU lock. ++ * %NULL is returned if no slave is found. ++ */ ++ ++struct net_device *netdev_get_xmit_slave(struct net_device *dev, ++ struct sk_buff *skb, ++ bool all_slaves) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_xmit_slave) ++ return NULL; ++ return ops->ndo_get_xmit_slave(dev, skb, all_slaves); ++} ++EXPORT_SYMBOL(netdev_get_xmit_slave); ++ ++static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_sk_get_lower_dev) ++ return NULL; ++ return ops->ndo_sk_get_lower_dev(dev, sk); ++} ++ ++/** ++ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket ++ * @dev: device ++ * @sk: the socket ++ * ++ * %NULL is returned if no lower device is found. 
++ */ ++ ++struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, ++ struct sock *sk) ++{ ++ struct net_device *lower; ++ ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ while (lower) { ++ dev = lower; ++ lower = netdev_sk_get_lower_dev(dev, sk); ++ } ++ ++ return dev; ++} ++EXPORT_SYMBOL(netdev_sk_get_lowest_dev); ++ ++static void netdev_adjacent_add_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(dev, iter->dev, ++ &dev->adj_list.lower); ++ } ++} ++ ++static void netdev_adjacent_del_links(struct net_device *dev) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.upper); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, dev->name, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_del(dev, iter->dev->name, ++ &dev->adj_list.lower); ++ } ++} ++ ++void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) ++{ ++ struct netdev_adjacent *iter; ++ ++ struct net *net = dev_net(dev); ++ ++ list_for_each_entry(iter, &dev->adj_list.upper, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.lower); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.lower); ++ } ++ ++ list_for_each_entry(iter, &dev->adj_list.lower, list) { ++ if (!net_eq(net, dev_net(iter->dev))) ++ continue; ++ netdev_adjacent_sysfs_del(iter->dev, oldname, ++ &iter->dev->adj_list.upper); ++ netdev_adjacent_sysfs_add(iter->dev, dev, ++ &iter->dev->adj_list.upper); ++ } ++} ++ ++void *netdev_lower_dev_get_private(struct net_device *dev, ++ struct net_device *lower_dev) ++{ ++ struct netdev_adjacent *lower; ++ ++ if (!lower_dev) ++ return NULL; ++ lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); ++ if (!lower) ++ return NULL; ++ ++ return lower->private; ++} ++EXPORT_SYMBOL(netdev_lower_dev_get_private); ++ ++ ++/** ++ * netdev_lower_state_changed - Dispatch event about lower device state change ++ * @lower_dev: device ++ * @lower_state_info: state to dispatch ++ * ++ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. ++ * The caller must hold the RTNL lock. 
++ */ ++void netdev_lower_state_changed(struct net_device *lower_dev, ++ void *lower_state_info) ++{ ++ struct netdev_notifier_changelowerstate_info changelowerstate_info = { ++ .info.dev = lower_dev, ++ }; ++ ++ ASSERT_RTNL(); ++ changelowerstate_info.lower_state_info = lower_state_info; ++ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, ++ &changelowerstate_info.info); ++} ++EXPORT_SYMBOL(netdev_lower_state_changed); ++ ++static void dev_change_rx_flags(struct net_device *dev, int flags) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_rx_flags) ++ ops->ndo_change_rx_flags(dev, flags); ++} ++ ++static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags; ++ kuid_t uid; ++ kgid_t gid; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_PROMISC; ++ dev->promiscuity += inc; ++ if (dev->promiscuity == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch promisc and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_PROMISC; ++ else { ++ dev->promiscuity -= inc; ++ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags != old_flags) { ++ pr_info("device %s %s promiscuous mode\n", ++ dev->name, ++ dev->flags & IFF_PROMISC ? "entered" : "left"); ++ if (audit_enabled) { ++ current_uid_gid(&uid, &gid); ++ audit_log(audit_context(), GFP_ATOMIC, ++ AUDIT_ANOM_PROMISCUOUS, ++ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", ++ dev->name, (dev->flags & IFF_PROMISC), ++ (old_flags & IFF_PROMISC), ++ from_kuid(&init_user_ns, audit_get_loginuid(current)), ++ from_kuid(&init_user_ns, uid), ++ from_kgid(&init_user_ns, gid), ++ audit_get_sessionid(current)); ++ } ++ ++ dev_change_rx_flags(dev, IFF_PROMISC); ++ } ++ if (notify) ++ __dev_notify_flags(dev, old_flags, IFF_PROMISC); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++int dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned int old_flags = dev->flags; ++ int err; ++ ++ err = __dev_set_promiscuity(dev, inc, true); ++ if (err < 0) ++ return err; ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_promiscuity); ++ ++static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) ++{ ++ unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ASSERT_RTNL(); ++ ++ dev->flags |= IFF_ALLMULTI; ++ dev->allmulti += inc; ++ if (dev->allmulti == 0) { ++ /* ++ * Avoid overflow. ++ * If inc causes overflow, untouch allmulti and return error. ++ */ ++ if (inc < 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ else { ++ dev->allmulti -= inc; ++ netdev_warn(dev, "allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n"); ++ return -EOVERFLOW; ++ } ++ } ++ if (dev->flags ^ old_flags) { ++ dev_change_rx_flags(dev, IFF_ALLMULTI); ++ dev_set_rx_mode(dev); ++ if (notify) ++ __dev_notify_flags(dev, old_flags, ++ dev->gflags ^ old_gflags); ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ * Return 0 if successful or a negative errno code on error. ++ */ ++ ++int dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ return __dev_set_allmulti(dev, inc, true); ++} ++EXPORT_SYMBOL(dev_set_allmulti); ++ ++/* ++ * Upload unicast and multicast address lists to device and ++ * configure RX filtering. When the device doesn't support unicast ++ * filtering it is put in promiscuous mode while unicast addresses ++ * are present. ++ */ ++void __dev_set_rx_mode(struct net_device *dev) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ /* dev_open will call this function so the list will stay sane. */ ++ if (!(dev->flags&IFF_UP)) ++ return; ++ ++ if (!netif_device_present(dev)) ++ return; ++ ++ if (!(dev->priv_flags & IFF_UNICAST_FLT)) { ++ /* Unicast addresses changes may only happen under the rtnl, ++ * therefore calling __dev_set_promiscuity here is safe. ++ */ ++ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { ++ __dev_set_promiscuity(dev, 1, false); ++ dev->uc_promisc = true; ++ } else if (netdev_uc_empty(dev) && dev->uc_promisc) { ++ __dev_set_promiscuity(dev, -1, false); ++ dev->uc_promisc = false; ++ } ++ } ++ ++ if (ops->ndo_set_rx_mode) ++ ops->ndo_set_rx_mode(dev); ++} ++ ++void dev_set_rx_mode(struct net_device *dev) ++{ ++ netif_addr_lock_bh(dev); ++ __dev_set_rx_mode(dev); ++ netif_addr_unlock_bh(dev); ++} ++ ++/** ++ * dev_get_flags - get flags reported to userspace ++ * @dev: device ++ * ++ * Get the combination of flag bits exported through APIs to userspace. ++ */ ++unsigned int dev_get_flags(const struct net_device *dev) ++{ ++ unsigned int flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING | ++ IFF_LOWER_UP | ++ IFF_DORMANT)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev)) { ++ if (netif_oper_up(dev)) ++ flags |= IFF_RUNNING; ++ if (netif_carrier_ok(dev)) ++ flags |= IFF_LOWER_UP; ++ if (netif_dormant(dev)) ++ flags |= IFF_DORMANT; ++ } ++ ++ return flags; ++} ++EXPORT_SYMBOL(dev_get_flags); ++ ++int __dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ unsigned int old_flags = dev->flags; ++ int ret; ++ ++ ASSERT_RTNL(); ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ if ((old_flags ^ flags) & IFF_MULTICAST) ++ dev_change_rx_flags(dev, IFF_MULTICAST); ++ ++ dev_set_rx_mode(dev); ++ ++ /* ++ * Have we downed the interface. 
We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { ++ if (old_flags & IFF_UP) ++ __dev_close(dev); ++ else ++ ret = __dev_open(dev, extack); ++ } ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? 1 : -1; ++ unsigned int old_flags = dev->flags; ++ ++ dev->gflags ^= IFF_PROMISC; ++ ++ if (__dev_set_promiscuity(dev, inc, false) >= 0) ++ if (dev->flags != old_flags) ++ dev_set_rx_mode(dev); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ * is important. Some (broken) drivers set IFF_PROMISC, when ++ * IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? 1 : -1; ++ ++ dev->gflags ^= IFF_ALLMULTI; ++ __dev_set_allmulti(dev, inc, false); ++ } ++ ++ return ret; ++} ++ ++void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, ++ unsigned int gchanges) ++{ ++ unsigned int changes = dev->flags ^ old_flags; ++ ++ if (gchanges) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); ++ ++ if (changes & IFF_UP) { ++ if (dev->flags & IFF_UP) ++ call_netdevice_notifiers(NETDEV_UP, dev); ++ else ++ call_netdevice_notifiers(NETDEV_DOWN, dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { ++ struct netdev_notifier_change_info change_info = { ++ .info = { ++ .dev = dev, ++ }, ++ .flags_changed = changes, ++ }; ++ ++ call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); ++ } ++} ++ ++/** ++ * dev_change_flags - change device settings ++ * @dev: device ++ * @flags: device state flags ++ * @extack: netlink extended ack ++ * ++ * Change settings on device based state flags. The flags are ++ * in the userspace exported format. ++ */ ++int dev_change_flags(struct net_device *dev, unsigned int flags, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ++ ++ ret = __dev_change_flags(dev, flags, extack); ++ if (ret < 0) ++ return ret; ++ ++ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); ++ __dev_notify_flags(dev, old_flags, changes); ++ return ret; ++} ++EXPORT_SYMBOL(dev_change_flags); ++ ++int __dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (ops->ndo_change_mtu) ++ return ops->ndo_change_mtu(dev, new_mtu); ++ ++ /* Pairs with all the lockless reads of dev->mtu in the stack */ ++ WRITE_ONCE(dev->mtu, new_mtu); ++ return 0; ++} ++EXPORT_SYMBOL(__dev_set_mtu); ++ ++int dev_validate_mtu(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ /* MTU must be positive, and in range */ ++ if (new_mtu < 0 || new_mtu < dev->min_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu less than device minimum"); ++ return -EINVAL; ++ } ++ ++ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { ++ NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/** ++ * dev_set_mtu_ext - Change maximum transfer unit ++ * @dev: device ++ * @new_mtu: new transfer unit ++ * @extack: netlink extended ack ++ * ++ * Change the maximum transfer size of the network device. 
++ */ ++int dev_set_mtu_ext(struct net_device *dev, int new_mtu, ++ struct netlink_ext_ack *extack) ++{ ++ int err, orig_mtu; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ err = dev_validate_mtu(dev, new_mtu, extack); ++ if (err) ++ return err; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); ++ err = notifier_to_errno(err); ++ if (err) ++ return err; ++ ++ orig_mtu = dev->mtu; ++ err = __dev_set_mtu(dev, new_mtu); ++ ++ if (!err) { ++ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ orig_mtu); ++ err = notifier_to_errno(err); ++ if (err) { ++ /* setting mtu back and notifying everyone again, ++ * so that they have a chance to revert changes. ++ */ ++ __dev_set_mtu(dev, orig_mtu); ++ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, ++ new_mtu); ++ } ++ } ++ return err; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct netlink_ext_ack extack; ++ int err; ++ ++ memset(&extack, 0, sizeof(extack)); ++ err = dev_set_mtu_ext(dev, new_mtu, &extack); ++ if (err && extack._msg) ++ net_err_ratelimited("%s: %s\n", dev->name, extack._msg); ++ return err; ++} ++EXPORT_SYMBOL(dev_set_mtu); ++ ++/** ++ * dev_change_tx_queue_len - Change TX queue length of a netdevice ++ * @dev: device ++ * @new_len: new tx queue length ++ */ ++int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) ++{ ++ unsigned int orig_len = dev->tx_queue_len; ++ int res; ++ ++ if (new_len != (unsigned int)new_len) ++ return -ERANGE; ++ ++ if (new_len != orig_len) { ++ dev->tx_queue_len = new_len; ++ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); ++ res = notifier_to_errno(res); ++ if (res) ++ goto err_rollback; ++ res = dev_qdisc_change_tx_queue_len(dev); ++ if (res) ++ goto err_rollback; ++ } ++ ++ return 0; ++ ++err_rollback: ++ netdev_err(dev, "refused to change device tx_queue_len\n"); ++ dev->tx_queue_len = orig_len; ++ return res; ++} ++ ++/** ++ * dev_set_group - Change group this device belongs to ++ * @dev: device ++ * @new_group: group this device should belong to ++ */ ++void dev_set_group(struct net_device *dev, int new_group) ++{ ++ dev->group = new_group; ++} ++ ++/** ++ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. 
++ * @dev: device ++ * @addr: new address ++ * @extack: netlink extended ack ++ */ ++int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, ++ struct netlink_ext_ack *extack) ++{ ++ struct netdev_notifier_pre_changeaddr_info info = { ++ .info.dev = dev, ++ .info.extack = extack, ++ .dev_addr = addr, ++ }; ++ int rc; ++ ++ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); ++ return notifier_to_errno(rc); ++} ++EXPORT_SYMBOL(dev_pre_changeaddr_notify); ++ ++/** ++ * dev_set_mac_address - Change Media Access Control Address ++ * @dev: device ++ * @sa: new address ++ * @extack: netlink extended ack ++ * ++ * Change the hardware (MAC) address of the device ++ */ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ int err; ++ ++ if (!ops->ndo_set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); ++ if (err) ++ return err; ++ err = ops->ndo_set_mac_address(dev, sa); ++ if (err) ++ return err; ++ dev->addr_assign_type = NET_ADDR_SET; ++ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ return 0; ++} ++EXPORT_SYMBOL(dev_set_mac_address); ++ ++static DECLARE_RWSEM(dev_addr_sem); ++ ++int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, ++ struct netlink_ext_ack *extack) ++{ ++ int ret; ++ ++ down_write(&dev_addr_sem); ++ ret = dev_set_mac_address(dev, sa, extack); ++ up_write(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_set_mac_address_user); ++ ++int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) ++{ ++ size_t size = sizeof(sa->sa_data); ++ struct net_device *dev; ++ int ret = 0; ++ ++ down_read(&dev_addr_sem); ++ rcu_read_lock(); ++ ++ dev = dev_get_by_name_rcu(net, dev_name); ++ if (!dev) { ++ ret = -ENODEV; ++ goto unlock; ++ } ++ if (!dev->addr_len) ++ memset(sa->sa_data, 0, size); ++ else ++ memcpy(sa->sa_data, dev->dev_addr, ++ min_t(size_t, size, dev->addr_len)); ++ sa->sa_family = dev->type; ++ ++unlock: ++ rcu_read_unlock(); ++ up_read(&dev_addr_sem); ++ return ret; ++} ++EXPORT_SYMBOL(dev_get_mac_address); ++ ++/** ++ * dev_change_carrier - Change device carrier ++ * @dev: device ++ * @new_carrier: new value ++ * ++ * Change device carrier ++ */ ++int dev_change_carrier(struct net_device *dev, bool new_carrier) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_change_carrier) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return ops->ndo_change_carrier(dev, new_carrier); ++} ++ ++/** ++ * dev_get_phys_port_id - Get device physical port ID ++ * @dev: device ++ * @ppid: port ID ++ * ++ * Get device physical port ID ++ */ ++int dev_get_phys_port_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ ++ if (!ops->ndo_get_phys_port_id) ++ return -EOPNOTSUPP; ++ return ops->ndo_get_phys_port_id(dev, ppid); ++} ++ ++/** ++ * dev_get_phys_port_name - Get device physical port name ++ * @dev: device ++ * @name: port name ++ * @len: limit of bytes to copy to name ++ * ++ * Get device physical port name ++ */ ++int dev_get_phys_port_name(struct net_device *dev, ++ char *name, size_t len) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ 
int err; ++ ++ if (ops->ndo_get_phys_port_name) { ++ err = ops->ndo_get_phys_port_name(dev, name, len); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ return devlink_compat_phys_port_name_get(dev, name, len); ++} ++ ++/** ++ * dev_get_port_parent_id - Get the device's port parent identifier ++ * @dev: network device ++ * @ppid: pointer to a storage for the port's parent identifier ++ * @recurse: allow/disallow recursion to lower devices ++ * ++ * Get the devices's port parent identifier ++ */ ++int dev_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid, ++ bool recurse) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ struct netdev_phys_item_id first = { }; ++ struct net_device *lower_dev; ++ struct list_head *iter; ++ int err; ++ ++ if (ops->ndo_get_port_parent_id) { ++ err = ops->ndo_get_port_parent_id(dev, ppid); ++ if (err != -EOPNOTSUPP) ++ return err; ++ } ++ ++ err = devlink_compat_switch_id_get(dev, ppid); ++ if (!recurse || err != -EOPNOTSUPP) ++ return err; ++ ++ netdev_for_each_lower_dev(dev, lower_dev, iter) { ++ err = dev_get_port_parent_id(lower_dev, ppid, true); ++ if (err) ++ break; ++ if (!first.id_len) ++ first = *ppid; ++ else if (memcmp(&first, ppid, sizeof(*ppid))) ++ return -EOPNOTSUPP; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(dev_get_port_parent_id); ++ ++/** ++ * netdev_port_same_parent_id - Indicate if two network devices have ++ * the same port parent identifier ++ * @a: first network device ++ * @b: second network device ++ */ ++bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) ++{ ++ struct netdev_phys_item_id a_id = { }; ++ struct netdev_phys_item_id b_id = { }; ++ ++ if (dev_get_port_parent_id(a, &a_id, true) || ++ dev_get_port_parent_id(b, &b_id, true)) ++ return false; ++ ++ return netdev_phys_item_id_same(&a_id, &b_id); ++} ++EXPORT_SYMBOL(netdev_port_same_parent_id); ++ ++/** ++ * dev_change_proto_down - set carrier according to proto_down. ++ * ++ * @dev: device ++ * @proto_down: new value ++ */ ++int dev_change_proto_down(struct net_device *dev, bool proto_down) ++{ ++ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) ++ return -EOPNOTSUPP; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ if (proto_down) ++ netif_carrier_off(dev); ++ else ++ netif_carrier_on(dev); ++ dev->proto_down = proto_down; ++ return 0; ++} ++ ++/** ++ * dev_change_proto_down_reason - proto down reason ++ * ++ * @dev: device ++ * @mask: proto down mask ++ * @value: proto down value ++ */ ++void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, ++ u32 value) ++{ ++ int b; ++ ++ if (!mask) { ++ dev->proto_down_reason = value; ++ } else { ++ for_each_set_bit(b, &mask, 32) { ++ if (value & (1 << b)) ++ dev->proto_down_reason |= BIT(b); ++ else ++ dev->proto_down_reason &= ~BIT(b); ++ } ++ } ++} ++ ++struct bpf_xdp_link { ++ struct bpf_link link; ++ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ ++ int flags; ++}; ++ ++static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) ++{ ++ if (flags & XDP_FLAGS_HW_MODE) ++ return XDP_MODE_HW; ++ if (flags & XDP_FLAGS_DRV_MODE) ++ return XDP_MODE_DRV; ++ if (flags & XDP_FLAGS_SKB_MODE) ++ return XDP_MODE_SKB; ++ return dev->netdev_ops->ndo_bpf ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++} ++ ++static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ switch (mode) { ++ case XDP_MODE_SKB: ++ return generic_xdp_install; ++ case XDP_MODE_DRV: ++ case XDP_MODE_HW: ++ return dev->netdev_ops->ndo_bpf; ++ default: ++ return NULL; ++ } ++} ++ ++static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ return dev->xdp_state[mode].link; ++} ++ ++static struct bpf_prog *dev_xdp_prog(struct net_device *dev, ++ enum bpf_xdp_mode mode) ++{ ++ struct bpf_xdp_link *link = dev_xdp_link(dev, mode); ++ ++ if (link) ++ return link->link.prog; ++ return dev->xdp_state[mode].prog; ++} ++ ++u8 dev_xdp_prog_count(struct net_device *dev) ++{ ++ u8 count = 0; ++ int i; ++ ++ for (i = 0; i < __MAX_XDP_MODE; i++) ++ if (dev->xdp_state[i].prog || dev->xdp_state[i].link) ++ count++; ++ return count; ++} ++EXPORT_SYMBOL_GPL(dev_xdp_prog_count); ++ ++u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) ++{ ++ struct bpf_prog *prog = dev_xdp_prog(dev, mode); ++ ++ return prog ? prog->aux->id : 0; ++} ++ ++static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_xdp_link *link) ++{ ++ dev->xdp_state[mode].link = link; ++ dev->xdp_state[mode].prog = NULL; ++} ++ ++static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, ++ struct bpf_prog *prog) ++{ ++ dev->xdp_state[mode].link = NULL; ++ dev->xdp_state[mode].prog = prog; ++} ++ ++static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, ++ bpf_op_t bpf_op, struct netlink_ext_ack *extack, ++ u32 flags, struct bpf_prog *prog) ++{ ++ struct netdev_bpf xdp; ++ int err; ++ ++ memset(&xdp, 0, sizeof(xdp)); ++ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; ++ xdp.extack = extack; ++ xdp.flags = flags; ++ xdp.prog = prog; ++ ++ /* Drivers assume refcnt is already incremented (i.e, prog pointer is ++ * "moved" into driver), so they don't increment it on their own, but ++ * they do decrement refcnt when program is detached or replaced. ++ * Given net_device also owns link/prog, we need to bump refcnt here ++ * to prevent drivers from underflowing it. 
++ */ ++ if (prog) ++ bpf_prog_inc(prog); ++ err = bpf_op(dev, &xdp); ++ if (err) { ++ if (prog) ++ bpf_prog_put(prog); ++ return err; ++ } ++ ++ if (mode != XDP_MODE_HW) ++ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); ++ ++ return 0; ++} ++ ++static void dev_xdp_uninstall(struct net_device *dev) ++{ ++ struct bpf_xdp_link *link; ++ struct bpf_prog *prog; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { ++ prog = dev_xdp_prog(dev, mode); ++ if (!prog) ++ continue; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) ++ continue; ++ ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ ++ /* auto-detach link from net device */ ++ link = dev_xdp_link(dev, mode); ++ if (link) ++ link->dev = NULL; ++ else ++ bpf_prog_put(prog); ++ ++ dev_xdp_set_link(dev, mode, NULL); ++ } ++} ++ ++static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog, u32 flags) ++{ ++ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); ++ struct bpf_prog *cur_prog; ++ struct net_device *upper; ++ struct list_head *iter; ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ /* either link or prog attachment, never both */ ++ if (link && (new_prog || old_prog)) ++ return -EINVAL; ++ /* link supports only XDP mode flags */ ++ if (link && (flags & ~XDP_FLAGS_MODES)) { ++ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); ++ return -EINVAL; ++ } ++ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ ++ if (num_modes > 1) { ++ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); ++ return -EINVAL; ++ } ++ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ ++ if (!num_modes && dev_xdp_prog_count(dev) > 1) { ++ NL_SET_ERR_MSG(extack, ++ "More than one program loaded, unset mode is ambiguous"); ++ return -EINVAL; ++ } ++ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ ++ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { ++ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); ++ return -EINVAL; ++ } ++ ++ mode = dev_xdp_mode(dev, flags); ++ /* can't replace attached link */ ++ if (dev_xdp_link(dev, mode)) { ++ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); ++ return -EBUSY; ++ } ++ ++ /* don't allow if an upper device already has a program */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) { ++ if (dev_xdp_prog_count(upper) > 0) { ++ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); ++ return -EEXIST; ++ } ++ } ++ ++ cur_prog = dev_xdp_prog(dev, mode); ++ /* can't replace attached prog with link */ ++ if (link && cur_prog) { ++ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); ++ return -EBUSY; ++ } ++ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { ++ NL_SET_ERR_MSG(extack, "Active program does not match expected"); ++ return -EEXIST; ++ } ++ ++ /* put effective new program into new_prog */ ++ if (link) ++ new_prog = link->link.prog; ++ ++ if (new_prog) { ++ bool offload = mode == XDP_MODE_HW; ++ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ++ ? 
XDP_MODE_DRV : XDP_MODE_SKB; ++ ++ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { ++ NL_SET_ERR_MSG(extack, "XDP program already attached"); ++ return -EBUSY; ++ } ++ if (!offload && dev_xdp_prog(dev, other_mode)) { ++ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); ++ return -EEXIST; ++ } ++ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { ++ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { ++ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); ++ return -EINVAL; ++ } ++ } ++ ++ /* don't call drivers if the effective program didn't change */ ++ if (new_prog != cur_prog) { ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ if (!bpf_op) { ++ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); ++ return -EOPNOTSUPP; ++ } ++ ++ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); ++ if (err) ++ return err; ++ } ++ ++ if (link) ++ dev_xdp_set_link(dev, mode, link); ++ else ++ dev_xdp_set_prog(dev, mode, new_prog); ++ if (cur_prog) ++ bpf_prog_put(cur_prog); ++ ++ return 0; ++} ++ ++static int dev_xdp_attach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); ++} ++ ++static int dev_xdp_detach_link(struct net_device *dev, ++ struct netlink_ext_ack *extack, ++ struct bpf_xdp_link *link) ++{ ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ ++ ASSERT_RTNL(); ++ ++ mode = dev_xdp_mode(dev, link->flags); ++ if (dev_xdp_link(dev, mode) != link) ++ return -EINVAL; ++ ++ bpf_op = dev_xdp_bpf_op(dev, mode); ++ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); ++ dev_xdp_set_link(dev, mode, NULL); ++ return 0; ++} ++ ++static void bpf_xdp_link_release(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ rtnl_lock(); ++ ++ /* if racing with net_device's tear down, xdp_link->dev might be ++ * already NULL, in which case link was already auto-detached ++ */ ++ if (xdp_link->dev) { ++ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); ++ xdp_link->dev = NULL; ++ } ++ ++ rtnl_unlock(); ++} ++ ++static int bpf_xdp_link_detach(struct bpf_link *link) ++{ ++ bpf_xdp_link_release(link); ++ return 0; ++} ++ ++static void bpf_xdp_link_dealloc(struct bpf_link *link) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ ++ kfree(xdp_link); ++} ++ ++static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, ++ struct seq_file *seq) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ seq_printf(seq, "ifindex:\t%u\n", ifindex); ++} ++ ++static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, ++ struct bpf_link_info *info) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ u32 ifindex = 0; ++ ++ rtnl_lock(); ++ if (xdp_link->dev) ++ ifindex = xdp_link->dev->ifindex; ++ rtnl_unlock(); ++ ++ info->xdp.ifindex = ifindex; ++ return 0; ++} ++ ++static int bpf_xdp_link_update(struct 
bpf_link *link, struct bpf_prog *new_prog, ++ struct bpf_prog *old_prog) ++{ ++ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); ++ enum bpf_xdp_mode mode; ++ bpf_op_t bpf_op; ++ int err = 0; ++ ++ rtnl_lock(); ++ ++ /* link might have been auto-released already, so fail */ ++ if (!xdp_link->dev) { ++ err = -ENOLINK; ++ goto out_unlock; ++ } ++ ++ if (old_prog && link->prog != old_prog) { ++ err = -EPERM; ++ goto out_unlock; ++ } ++ old_prog = link->prog; ++ if (old_prog->type != new_prog->type || ++ old_prog->expected_attach_type != new_prog->expected_attach_type) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ ++ if (old_prog == new_prog) { ++ /* no-op, don't disturb drivers */ ++ bpf_prog_put(new_prog); ++ goto out_unlock; ++ } ++ ++ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); ++ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); ++ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, ++ xdp_link->flags, new_prog); ++ if (err) ++ goto out_unlock; ++ ++ old_prog = xchg(&link->prog, new_prog); ++ bpf_prog_put(old_prog); ++ ++out_unlock: ++ rtnl_unlock(); ++ return err; ++} ++ ++static const struct bpf_link_ops bpf_xdp_link_lops = { ++ .release = bpf_xdp_link_release, ++ .dealloc = bpf_xdp_link_dealloc, ++ .detach = bpf_xdp_link_detach, ++ .show_fdinfo = bpf_xdp_link_show_fdinfo, ++ .fill_link_info = bpf_xdp_link_fill_link_info, ++ .update_prog = bpf_xdp_link_update, ++}; ++ ++int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) ++{ ++ struct net *net = current->nsproxy->net_ns; ++ struct bpf_link_primer link_primer; ++ struct bpf_xdp_link *link; ++ struct net_device *dev; ++ int err, fd; ++ ++ rtnl_lock(); ++ dev = dev_get_by_index(net, attr->link_create.target_ifindex); ++ if (!dev) { ++ rtnl_unlock(); ++ return -EINVAL; ++ } ++ ++ link = kzalloc(sizeof(*link), GFP_USER); ++ if (!link) { ++ err = -ENOMEM; ++ goto unlock; ++ } ++ ++ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); ++ link->dev = dev; ++ link->flags = attr->link_create.flags; ++ ++ err = bpf_link_prime(&link->link, &link_primer); ++ if (err) { ++ kfree(link); ++ goto unlock; ++ } ++ ++ err = dev_xdp_attach_link(dev, NULL, link); ++ rtnl_unlock(); ++ ++ if (err) { ++ link->dev = NULL; ++ bpf_link_cleanup(&link_primer); ++ goto out_put_dev; ++ } ++ ++ fd = bpf_link_settle(&link_primer); ++ /* link itself doesn't hold dev's refcnt to not complicate shutdown */ ++ dev_put(dev); ++ return fd; ++ ++unlock: ++ rtnl_unlock(); ++ ++out_put_dev: ++ dev_put(dev); ++ return err; ++} ++ ++/** ++ * dev_change_xdp_fd - set or clear a bpf program for a device rx path ++ * @dev: device ++ * @extack: netlink extended ack ++ * @fd: new program fd or negative value to clear ++ * @expected_fd: old program fd that userspace expects to replace or clear ++ * @flags: xdp-related flags ++ * ++ * Set or clear a bpf program for a device ++ */ ++int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, ++ int fd, int expected_fd, u32 flags) ++{ ++ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); ++ struct bpf_prog *new_prog = NULL, *old_prog = NULL; ++ int err; ++ ++ ASSERT_RTNL(); ++ ++ if (fd >= 0) { ++ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(new_prog)) ++ return PTR_ERR(new_prog); ++ } ++ ++ if (expected_fd >= 0) { ++ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, ++ mode != XDP_MODE_SKB); ++ if (IS_ERR(old_prog)) { ++ err = PTR_ERR(old_prog); ++ old_prog = NULL; ++ goto 
err_out; ++ } ++ } ++ ++ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); ++ ++err_out: ++ if (err && new_prog) ++ bpf_prog_put(new_prog); ++ if (old_prog) ++ bpf_prog_put(old_prog); ++ return err; ++} ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * @net: the applicable net namespace ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(struct net *net) ++{ ++ int ifindex = net->ifindex; ++ ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(net, ifindex)) ++ return net->ifindex = ifindex; ++ } ++} ++ ++/* Delayed registration/unregisteration */ ++LIST_HEAD(net_todo_list); ++DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); ++ ++static void net_set_todo(struct net_device *dev) ++{ ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ atomic_inc(&dev_net(dev)->dev_unreg_count); ++} ++ ++static netdev_features_t netdev_sync_upper_features(struct net_device *lower, ++ struct net_device *upper, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(upper->wanted_features & feature) ++ && (features & feature)) { ++ netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", ++ &feature, upper->name); ++ features &= ~feature; ++ } ++ } ++ ++ return features; ++} ++ ++static void netdev_sync_lower_features(struct net_device *upper, ++ struct net_device *lower, netdev_features_t features) ++{ ++ netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; ++ netdev_features_t feature; ++ int feature_bit; ++ ++ for_each_netdev_feature(upper_disables, feature_bit) { ++ feature = __NETIF_F_BIT(feature_bit); ++ if (!(features & feature) && (lower->features & feature)) { ++ netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", ++ &feature, lower->name); ++ lower->wanted_features &= ~feature; ++ __netdev_update_features(lower); ++ ++ if (unlikely(lower->features & feature)) ++ netdev_WARN(upper, "failed to disable %pNF on %s!\n", ++ &feature, lower->name); ++ else ++ netdev_features_change(lower); ++ } ++ } ++} ++ ++static netdev_features_t netdev_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ /* Fix illegal checksum combinations */ ++ if ((features & NETIF_F_HW_CSUM) && ++ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { ++ netdev_warn(dev, "mixed HW and IP checksum settings.\n"); ++ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); ++ } ++ ++ /* TSO requires that SG is present as well. 
*/ ++ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); ++ features &= ~NETIF_F_ALL_TSO; ++ } ++ ++ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IP_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO; ++ features &= ~NETIF_F_TSO_ECN; ++ } ++ ++ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && ++ !(features & NETIF_F_IPV6_CSUM)) { ++ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); ++ features &= ~NETIF_F_TSO6; ++ } ++ ++ /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ ++ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) ++ features &= ~NETIF_F_TSO_MANGLEID; ++ ++ /* TSO ECN requires that TSO is present as well. */ ++ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) ++ features &= ~NETIF_F_TSO_ECN; ++ ++ /* Software GSO depends on SG. */ ++ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { ++ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); ++ features &= ~NETIF_F_GSO; ++ } ++ ++ /* GSO partial features require GSO partial be set */ ++ if ((features & dev->gso_partial_features) && ++ !(features & NETIF_F_GSO_PARTIAL)) { ++ netdev_dbg(dev, ++ "Dropping partially supported GSO features since no GSO partial.\n"); ++ features &= ~dev->gso_partial_features; ++ } ++ ++ if (!(features & NETIF_F_RXCSUM)) { ++ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet ++ * successfully merged by hardware must also have the ++ * checksum verified by hardware. If the user does not ++ * want to enable RXCSUM, logically, we should disable GRO_HW. ++ */ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ /* LRO/HW-GRO features cannot be combined with RX-FCS */ ++ if (features & NETIF_F_RXFCS) { ++ if (features & NETIF_F_LRO) { ++ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_GRO_HW) { ++ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); ++ features &= ~NETIF_F_GRO_HW; ++ } ++ } ++ ++ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { ++ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); ++ features &= ~NETIF_F_LRO; ++ } ++ ++ if (features & NETIF_F_HW_TLS_TX) { ++ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == ++ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ bool hw_csum = features & NETIF_F_HW_CSUM; ++ ++ if (!ip_csum && !hw_csum) { ++ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_TX; ++ } ++ } ++ ++ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { ++ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); ++ features &= ~NETIF_F_HW_TLS_RX; ++ } ++ ++ return features; ++} ++ ++int __netdev_update_features(struct net_device *dev) ++{ ++ struct net_device *upper, *lower; ++ netdev_features_t features; ++ struct list_head *iter; ++ int err = -1; ++ ++ ASSERT_RTNL(); ++ ++ features = netdev_get_wanted_features(dev); ++ ++ if (dev->netdev_ops->ndo_fix_features) ++ features = dev->netdev_ops->ndo_fix_features(dev, features); ++ ++ /* driver might be less strict about feature dependencies */ ++ features = netdev_fix_features(dev, features); ++ ++ /* some features can't 
be enabled if they're off on an upper device */ ++ netdev_for_each_upper_dev_rcu(dev, upper, iter) ++ features = netdev_sync_upper_features(dev, upper, features); ++ ++ if (dev->features == features) ++ goto sync_lower; ++ ++ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", ++ &dev->features, &features); ++ ++ if (dev->netdev_ops->ndo_set_features) ++ err = dev->netdev_ops->ndo_set_features(dev, features); ++ else ++ err = 0; ++ ++ if (unlikely(err < 0)) { ++ netdev_err(dev, ++ "set_features() failed (%d); wanted %pNF, left %pNF\n", ++ err, &features, &dev->features); ++ /* return non-0 since some features might have changed and ++ * it's better to fire a spurious notification than miss it ++ */ ++ return -1; ++ } ++ ++sync_lower: ++ /* some features must be disabled on lower devices when disabled ++ * on an upper device (think: bonding master or bridge) ++ */ ++ netdev_for_each_lower_dev(dev, lower, iter) ++ netdev_sync_lower_features(dev, lower, features); ++ ++ if (!err) { ++ netdev_features_t diff = features ^ dev->features; ++ ++ if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ /* udp_tunnel_{get,drop}_rx_info both need ++ * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the ++ * device, or they won't do anything. ++ * Thus we need to update dev->features ++ * *before* calling udp_tunnel_get_rx_info, ++ * but *after* calling udp_tunnel_drop_rx_info. ++ */ ++ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { ++ dev->features = features; ++ udp_tunnel_get_rx_info(dev); ++ } else { ++ udp_tunnel_drop_rx_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_ctag_filter_info(dev); ++ } else { ++ vlan_drop_rx_ctag_filter_info(dev); ++ } ++ } ++ ++ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { ++ if (features & NETIF_F_HW_VLAN_STAG_FILTER) { ++ dev->features = features; ++ err |= vlan_get_rx_stag_filter_info(dev); ++ } else { ++ vlan_drop_rx_stag_filter_info(dev); ++ } ++ } ++ ++ dev->features = features; ++ } ++ ++ return err < 0 ? 0 : 1; ++} ++ ++/** ++ * netdev_update_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications if it ++ * has changed. Should be called after driver or hardware dependent ++ * conditions might have changed that influence the features. ++ */ ++void netdev_update_features(struct net_device *dev) ++{ ++ if (__netdev_update_features(dev)) ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_update_features); ++ ++/** ++ * netdev_change_features - recalculate device features ++ * @dev: the device to check ++ * ++ * Recalculate dev->features set and send notifications even ++ * if they have not changed. Should be called instead of ++ * netdev_update_features() if also dev->vlan_features might ++ * have changed to allow the changes to be propagated to stacked ++ * VLAN devices. ++ */ ++void netdev_change_features(struct net_device *dev) ++{ ++ __netdev_update_features(dev); ++ netdev_features_change(dev); ++} ++EXPORT_SYMBOL(netdev_change_features); ++ ++/** ++ * netif_stacked_transfer_operstate - transfer operstate ++ * @rootdev: the root or lower level device to transfer state from ++ * @dev: the device to transfer operstate to ++ * ++ * Transfer operational state from root to device. This is normally ++ * called when a stacking relationship exists between the root ++ * device and the device(a leaf device). 
++ */ ++void netif_stacked_transfer_operstate(const struct net_device *rootdev, ++ struct net_device *dev) ++{ ++ if (rootdev->operstate == IF_OPER_DORMANT) ++ netif_dormant_on(dev); ++ else ++ netif_dormant_off(dev); ++ ++ if (rootdev->operstate == IF_OPER_TESTING) ++ netif_testing_on(dev); ++ else ++ netif_testing_off(dev); ++ ++ if (netif_carrier_ok(rootdev)) ++ netif_carrier_on(dev); ++ else ++ netif_carrier_off(dev); ++} ++EXPORT_SYMBOL(netif_stacked_transfer_operstate); ++ ++static int netif_alloc_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ struct netdev_rx_queue *rx; ++ size_t sz = count * sizeof(*rx); ++ int err = 0; ++ ++ BUG_ON(count < 1); ++ ++ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!rx) ++ return -ENOMEM; ++ ++ dev->_rx = rx; ++ ++ for (i = 0; i < count; i++) { ++ rx[i].dev = dev; ++ ++ /* XDP RX-queue setup */ ++ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); ++ if (err < 0) ++ goto err_rxq_info; ++ } ++ return 0; ++ ++err_rxq_info: ++ /* Rollback successful reg's and free other resources */ ++ while (i--) ++ xdp_rxq_info_unreg(&rx[i].xdp_rxq); ++ kvfree(dev->_rx); ++ dev->_rx = NULL; ++ return err; ++} ++ ++static void netif_free_rx_queues(struct net_device *dev) ++{ ++ unsigned int i, count = dev->num_rx_queues; ++ ++ /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ ++ if (!dev->_rx) ++ return; ++ ++ for (i = 0; i < count; i++) ++ xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); ++ ++ kvfree(dev->_rx); ++} ++ ++static void netdev_init_one_queue(struct net_device *dev, ++ struct netdev_queue *queue, void *_unused) ++{ ++ /* Initialize queue lock */ ++ spin_lock_init(&queue->_xmit_lock); ++ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); ++ queue->xmit_lock_owner = -1; ++ netdev_queue_numa_node_write(queue, NUMA_NO_NODE); ++ queue->dev = dev; ++#ifdef CONFIG_BQL ++ dql_init(&queue->dql, HZ); ++#endif ++} ++ ++static void netif_free_tx_queues(struct net_device *dev) ++{ ++ kvfree(dev->_tx); ++} ++ ++static int netif_alloc_netdev_queues(struct net_device *dev) ++{ ++ unsigned int count = dev->num_tx_queues; ++ struct netdev_queue *tx; ++ size_t sz = count * sizeof(*tx); ++ ++ if (count < 1 || count > 0xffff) ++ return -EINVAL; ++ ++ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!tx) ++ return -ENOMEM; ++ ++ dev->_tx = tx; ++ ++ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); ++ spin_lock_init(&dev->tx_global_lock); ++ ++ return 0; ++} ++ ++void netif_tx_stop_all_queues(struct net_device *dev) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < dev->num_tx_queues; i++) { ++ struct netdev_queue *txq = netdev_get_tx_queue(dev, i); ++ ++ netif_tx_stop_queue(txq); ++ } ++} ++EXPORT_SYMBOL(netif_tx_stop_all_queues); ++ ++/** ++ * register_netdevice() - register a network device ++ * @dev: device to register ++ * ++ * Take a prepared network device structure and make it externally accessible. ++ * A %NETDEV_REGISTER message is sent to the netdev notifier chain. ++ * Callers must hold the rtnl lock - you may want register_netdev() ++ * instead of this. ++ */ ++int register_netdevice(struct net_device *dev) ++{ ++ int ret; ++ struct net *net = dev_net(dev); ++ ++ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < ++ NETDEV_FEATURE_COUNT); ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ might_sleep(); ++ ++ /* When net_device's are persistent, this will be fatal. 
*/ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ BUG_ON(!net); ++ ++ ret = ethtool_check_ops(dev->ethtool_ops); ++ if (ret) ++ return ret; ++ ++ spin_lock_init(&dev->addr_list_lock); ++ netdev_set_addr_lockdep_class(dev); ++ ++ ret = dev_get_valid_name(net, dev, dev->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = -ENOMEM; ++ dev->name_node = netdev_name_node_head_alloc(dev); ++ if (!dev->name_node) ++ goto out; ++ ++ /* Init, if this function is available */ ++ if (dev->netdev_ops->ndo_init) { ++ ret = dev->netdev_ops->ndo_init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto err_free_name; ++ } ++ } ++ ++ if (((dev->hw_features | dev->features) & ++ NETIF_F_HW_VLAN_CTAG_FILTER) && ++ (!dev->netdev_ops->ndo_vlan_rx_add_vid || ++ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { ++ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ++ ret = -EINVAL; ++ goto err_uninit; ++ } ++ ++ ret = -EBUSY; ++ if (!dev->ifindex) ++ dev->ifindex = dev_new_index(net); ++ else if (__dev_get_by_index(net, dev->ifindex)) ++ goto err_uninit; ++ ++ /* Transfer changeable features to wanted_features and enable ++ * software offloads (GSO and GRO). ++ */ ++ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); ++ dev->features |= NETIF_F_SOFT_FEATURES; ++ ++ if (dev->udp_tunnel_nic_info) { ++ dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; ++ } ++ ++ dev->wanted_features = dev->features & dev->hw_features; ++ ++ if (!(dev->flags & IFF_LOOPBACK)) ++ dev->hw_features |= NETIF_F_NOCACHE_COPY; ++ ++ /* If IPv4 TCP segmentation offload is supported we should also ++ * allow the device to enable segmenting the frame with the option ++ * of ignoring a static IP ID value. This doesn't enable the ++ * feature itself but allows the user to enable it later. ++ */ ++ if (dev->hw_features & NETIF_F_TSO) ++ dev->hw_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->vlan_features & NETIF_F_TSO) ++ dev->vlan_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->mpls_features & NETIF_F_TSO) ++ dev->mpls_features |= NETIF_F_TSO_MANGLEID; ++ if (dev->hw_enc_features & NETIF_F_TSO) ++ dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; ++ ++ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. ++ */ ++ dev->vlan_features |= NETIF_F_HIGHDMA; ++ ++ /* Make NETIF_F_SG inheritable to tunnel devices. ++ */ ++ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; ++ ++ /* Make NETIF_F_SG inheritable to MPLS. ++ */ ++ dev->mpls_features |= NETIF_F_SG; ++ ++ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) ++ goto err_uninit; ++ ++ ret = netdev_register_kobject(dev); ++ write_lock(&dev_base_lock); ++ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; ++ write_unlock(&dev_base_lock); ++ if (ret) ++ goto err_uninit; ++ ++ __netdev_update_features(dev); ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ linkwatch_init_dev(dev); ++ ++ dev_init_scheduler(dev); ++ ++ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); ++ list_netdevice(dev); ++ ++ add_device_randomness(dev->dev_addr, dev->addr_len); ++ ++ /* If the device has permanent device address, driver should ++ * set dev_addr and also addr_assign_type should be set to ++ * NET_ADDR_PERM (default value). 
++ */ ++ if (dev->addr_assign_type == NET_ADDR_PERM) ++ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); ++ ++ /* Notify protocols, that a new device appeared. */ ++ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ret = notifier_to_errno(ret); ++ if (ret) { ++ /* Expect explicit free_netdev() on failure */ ++ dev->needs_free_netdev = false; ++ unregister_netdevice_queue(dev, NULL); ++ goto out; ++ } ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++out: ++ return ret; ++ ++err_uninit: ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++err_free_name: ++ netdev_name_node_free(dev->name_node); ++ goto out; ++} ++EXPORT_SYMBOL(register_netdevice); ++ ++/** ++ * init_dummy_netdev - init a dummy network device for NAPI ++ * @dev: device to init ++ * ++ * This takes a network device structure and initialize the minimum ++ * amount of fields so it can be used to schedule NAPI polls without ++ * registering a full blown interface. This is to be used by drivers ++ * that need to tie several hardware interfaces to a single NAPI ++ * poll scheduler due to HW limitations. ++ */ ++int init_dummy_netdev(struct net_device *dev) ++{ ++ /* Clear everything. Note we don't initialize spinlocks ++ * are they aren't supposed to be taken by any of the ++ * NAPI code and this dummy netdev is supposed to be ++ * only ever used for NAPI polls ++ */ ++ memset(dev, 0, sizeof(struct net_device)); ++ ++ /* make sure we BUG if trying to hit standard ++ * register/unregister code path ++ */ ++ dev->reg_state = NETREG_DUMMY; ++ ++ /* NAPI wants this */ ++ INIT_LIST_HEAD(&dev->napi_list); ++ ++ /* a dummy interface is started by default */ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ set_bit(__LINK_STATE_START, &dev->state); ++ ++ /* napi_busy_loop stats accounting wants this */ ++ dev_net_set(dev, &init_net); ++ ++ /* Note : We dont allocate pcpu_refcnt for dummy devices, ++ * because users of this 'device' dont need to change ++ * its refcount. ++ */ ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(init_dummy_netdev); ++ ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdevice that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. 
++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ if (rtnl_lock_killable()) ++ return -EINTR; ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++int netdev_refcnt_read(const struct net_device *dev) ++{ ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ int i, refcnt = 0; ++ ++ for_each_possible_cpu(i) ++ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); ++ return refcnt; ++#else ++ return refcount_read(&dev->dev_refcnt); ++#endif ++} ++EXPORT_SYMBOL(netdev_refcnt_read); ++ ++int netdev_unregister_timeout_secs __read_mostly = 10; ++ ++#define WAIT_REFS_MIN_MSECS 1 ++#define WAIT_REFS_MAX_MSECS 250 ++/** ++ * netdev_wait_allrefs_any - wait until all references are gone. ++ * @list: list of net_devices to wait on ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static struct net_device *netdev_wait_allrefs_any(struct list_head *list) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ struct net_device *dev; ++ int wait = 0; ++ ++ rebroadcast_time = warning_time = jiffies; ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ while (true) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_lock(); ++ ++ /* Rebroadcast unregister notification */ ++ list_for_each_entry(dev, list, todo_list) ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ __rtnl_unlock(); ++ rcu_barrier(); ++ rtnl_lock(); ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ break; ++ } ++ ++ __rtnl_unlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ if (!wait) { ++ rcu_barrier(); ++ wait = WAIT_REFS_MIN_MSECS; ++ } else { ++ msleep(wait); ++ wait = min(wait << 1, WAIT_REFS_MAX_MSECS); ++ } ++ ++ list_for_each_entry(dev, list, todo_list) ++ if (netdev_refcnt_read(dev) == 1) ++ return dev; ++ ++ if (time_after(jiffies, warning_time + ++ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { ++ list_for_each_entry(dev, list, todo_list) { ++ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", ++ dev->name, netdev_refcnt_read(dev)); ++ ref_tracker_dir_print(&dev->refcnt_tracker, 10); ++ } ++ ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... ++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock(). ++ * This allows us to deal with problems: ++ * 1) We can delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ * ++ * We must not return until all unregister events added during ++ * the interval the lock was held have been completed. 
++ */ ++void netdev_run_todo(void) ++{ ++ struct net_device *dev, *tmp; ++ struct list_head list; ++#ifdef CONFIG_LOCKDEP ++ struct list_head unlink_list; ++ ++ list_replace_init(&net_unlink_list, &unlink_list); ++ ++ while (!list_empty(&unlink_list)) { ++ struct net_device *dev = list_first_entry(&unlink_list, ++ struct net_device, ++ unlink_list); ++ list_del_init(&dev->unlink_list); ++ dev->nested_level = dev->lower_level - 1; ++ } ++#endif ++ ++ /* Snapshot list, allow later requests */ ++ list_replace_init(&net_todo_list, &list); ++ ++ __rtnl_unlock(); ++ ++ /* Wait for rcu callbacks to finish before next phase */ ++ if (!list_empty(&list)) ++ rcu_barrier(); ++ ++ list_for_each_entry_safe(dev, tmp, &list, todo_list) { ++ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { ++ netdev_WARN(dev, "run_todo but not unregistering\n"); ++ list_del(&dev->todo_list); ++ continue; ++ } ++ ++ write_lock(&dev_base_lock); ++ dev->reg_state = NETREG_UNREGISTERED; ++ write_unlock(&dev_base_lock); ++ linkwatch_forget_dev(dev); ++ } ++ ++ while (!list_empty(&list)) { ++ dev = netdev_wait_allrefs_any(&list); ++ list_del(&dev->todo_list); ++ ++ /* paranoia */ ++ BUG_ON(netdev_refcnt_read(dev) != 1); ++ BUG_ON(!list_empty(&dev->ptype_all)); ++ BUG_ON(!list_empty(&dev->ptype_specific)); ++ WARN_ON(rcu_access_pointer(dev->ip_ptr)); ++ WARN_ON(rcu_access_pointer(dev->ip6_ptr)); ++#if IS_ENABLED(CONFIG_DECNET) ++ WARN_ON(dev->dn_ptr); ++#endif ++ if (dev->priv_destructor) ++ dev->priv_destructor(dev); ++ if (dev->needs_free_netdev) ++ free_netdev(dev); ++ ++ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) ++ wake_up(&netdev_unregistering_wq); ++ ++ /* Free network device */ ++ kobject_put(&dev->dev.kobj); ++ } ++} ++ ++/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has ++ * all the same fields in the same order as net_device_stats, with only ++ * the type differing, but rtnl_link_stats64 may have additional fields ++ * at the end for newer counters. ++ */ ++void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, ++ const struct net_device_stats *netdev_stats) ++{ ++#if BITS_PER_LONG == 64 ++ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); ++ memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + sizeof(*netdev_stats), 0, ++ sizeof(*stats64) - sizeof(*netdev_stats)); ++#else ++ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); ++ const unsigned long *src = (const unsigned long *)netdev_stats; ++ u64 *dst = (u64 *)stats64; ++ ++ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); ++ for (i = 0; i < n; i++) ++ dst[i] = src[i]; ++ /* zero out counters that only exist in rtnl_link_stats64 */ ++ memset((char *)stats64 + n * sizeof(u64), 0, ++ sizeof(*stats64) - n * sizeof(u64)); ++#endif ++} ++EXPORT_SYMBOL(netdev_stats_to_stats64); ++ ++struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) ++{ ++ struct net_device_core_stats __percpu *p; ++ ++ p = alloc_percpu_gfp(struct net_device_core_stats, ++ GFP_ATOMIC | __GFP_NOWARN); ++ ++ if (p && cmpxchg(&dev->core_stats, NULL, p)) ++ free_percpu(p); ++ ++ /* This READ_ONCE() pairs with the cmpxchg() above */ ++ return READ_ONCE(dev->core_stats); ++} ++EXPORT_SYMBOL(netdev_core_stats_alloc); ++ ++/** ++ * dev_get_stats - get network device statistics ++ * @dev: device to get statistics from ++ * @storage: place to store stats ++ * ++ * Get network statistics from device. Return @storage. 
++ * The device driver may provide its own method by setting ++ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; ++ * otherwise the internal statistics structure is used. ++ */ ++struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *storage) ++{ ++ const struct net_device_ops *ops = dev->netdev_ops; ++ const struct net_device_core_stats __percpu *p; ++ ++ if (ops->ndo_get_stats64) { ++ memset(storage, 0, sizeof(*storage)); ++ ops->ndo_get_stats64(dev, storage); ++ } else if (ops->ndo_get_stats) { ++ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); ++ } else { ++ netdev_stats_to_stats64(storage, &dev->stats); ++ } ++ ++ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ ++ p = READ_ONCE(dev->core_stats); ++ if (p) { ++ const struct net_device_core_stats *core_stats; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ core_stats = per_cpu_ptr(p, i); ++ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); ++ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); ++ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); ++ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); ++ } ++ } ++ return storage; ++} ++EXPORT_SYMBOL(dev_get_stats); ++ ++/** ++ * dev_fetch_sw_netstats - get per-cpu network device statistics ++ * @s: place to store stats ++ * @netstats: per-cpu network stats to read from ++ * ++ * Read per-cpu network statistics and populate the related fields in @s. ++ */ ++void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, ++ const struct pcpu_sw_netstats __percpu *netstats) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ u64 rx_packets, rx_bytes, tx_packets, tx_bytes; ++ const struct pcpu_sw_netstats *stats; ++ unsigned int start; ++ ++ stats = per_cpu_ptr(netstats, cpu); ++ do { ++ start = u64_stats_fetch_begin_irq(&stats->syncp); ++ rx_packets = u64_stats_read(&stats->rx_packets); ++ rx_bytes = u64_stats_read(&stats->rx_bytes); ++ tx_packets = u64_stats_read(&stats->tx_packets); ++ tx_bytes = u64_stats_read(&stats->tx_bytes); ++ } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ++ ++ s->rx_packets += rx_packets; ++ s->rx_bytes += rx_bytes; ++ s->tx_packets += tx_packets; ++ s->tx_bytes += tx_bytes; ++ } ++} ++EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); ++ ++/** ++ * dev_get_tstats64 - ndo_get_stats64 implementation ++ * @dev: device to get statistics from ++ * @s: place to store stats ++ * ++ * Populate @s from dev->stats and dev->tstats. Can be used as ++ * ndo_get_stats64() callback. 
++ */ ++void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) ++{ ++ netdev_stats_to_stats64(s, &dev->stats); ++ dev_fetch_sw_netstats(s, dev->tstats); ++} ++EXPORT_SYMBOL_GPL(dev_get_tstats64); ++ ++struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) ++{ ++ struct netdev_queue *queue = dev_ingress_queue(dev); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (queue) ++ return queue; ++ queue = kzalloc(sizeof(*queue), GFP_KERNEL); ++ if (!queue) ++ return NULL; ++ netdev_init_one_queue(dev, queue, NULL); ++ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); ++ queue->qdisc_sleeping = &noop_qdisc; ++ rcu_assign_pointer(dev->ingress_queue, queue); ++#endif ++ return queue; ++} ++ ++static const struct ethtool_ops default_ethtool_ops; ++ ++void netdev_set_default_ethtool_ops(struct net_device *dev, ++ const struct ethtool_ops *ops) ++{ ++ if (dev->ethtool_ops == &default_ethtool_ops) ++ dev->ethtool_ops = ops; ++} ++EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); ++ ++void netdev_freemem(struct net_device *dev) ++{ ++ char *addr = (char *)dev - dev->padded; ++ ++ kvfree(addr); ++} ++ ++/** ++ * alloc_netdev_mqs - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @name_assign_type: origin of device name ++ * @setup: callback to initialize device ++ * @txqs: the number of TX subqueues to allocate ++ * @rxqs: the number of RX subqueues to allocate ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. Also allocates subqueue structs ++ * for each queue on the device. ++ */ ++struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, ++ unsigned char name_assign_type, ++ void (*setup)(struct net_device *), ++ unsigned int txqs, unsigned int rxqs) ++{ ++ struct net_device *dev; ++ unsigned int alloc_size; ++ struct net_device *p; ++ ++ BUG_ON(strlen(name) >= sizeof(dev->name)); ++ ++ if (txqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); ++ return NULL; ++ } ++ ++ if (rxqs < 1) { ++ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); ++ return NULL; ++ } ++ ++ alloc_size = sizeof(struct net_device); ++ if (sizeof_priv) { ++ /* ensure 32-byte alignment of private area */ ++ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); ++ alloc_size += sizeof_priv; ++ } ++ /* ensure 32-byte alignment of whole construct */ ++ alloc_size += NETDEV_ALIGN - 1; ++ ++ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); ++ if (!p) ++ return NULL; ++ ++ dev = PTR_ALIGN(p, NETDEV_ALIGN); ++ dev->padded = (char *)dev - (char *)p; ++ ++ ref_tracker_dir_init(&dev->refcnt_tracker, 128); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ dev->pcpu_refcnt = alloc_percpu(int); ++ if (!dev->pcpu_refcnt) ++ goto free_dev; ++ __dev_hold(dev); ++#else ++ refcount_set(&dev->dev_refcnt, 1); ++#endif ++ ++ if (dev_addr_init(dev)) ++ goto free_pcpu; ++ ++ dev_mc_init(dev); ++ dev_uc_init(dev); ++ ++ dev_net_set(dev, &init_net); ++ ++ dev->gso_max_size = GSO_LEGACY_MAX_SIZE; ++ dev->gso_max_segs = GSO_MAX_SEGS; ++ dev->gro_max_size = GRO_LEGACY_MAX_SIZE; ++ dev->tso_max_size = TSO_LEGACY_MAX_SIZE; ++ dev->tso_max_segs = TSO_MAX_SEGS; ++ dev->upper_level = 1; ++ dev->lower_level = 1; ++#ifdef CONFIG_LOCKDEP ++ dev->nested_level = 0; ++ INIT_LIST_HEAD(&dev->unlink_list); ++#endif ++ ++ INIT_LIST_HEAD(&dev->napi_list); ++ INIT_LIST_HEAD(&dev->unreg_list); ++ INIT_LIST_HEAD(&dev->close_list); ++ 
INIT_LIST_HEAD(&dev->link_watch_list); ++ INIT_LIST_HEAD(&dev->adj_list.upper); ++ INIT_LIST_HEAD(&dev->adj_list.lower); ++ INIT_LIST_HEAD(&dev->ptype_all); ++ INIT_LIST_HEAD(&dev->ptype_specific); ++ INIT_LIST_HEAD(&dev->net_notifier_list); ++#ifdef CONFIG_NET_SCHED ++ hash_init(dev->qdisc_hash); ++#endif ++ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; ++ setup(dev); ++ ++ if (!dev->tx_queue_len) { ++ dev->priv_flags |= IFF_NO_QUEUE; ++ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; ++ } ++ ++ dev->num_tx_queues = txqs; ++ dev->real_num_tx_queues = txqs; ++ if (netif_alloc_netdev_queues(dev)) ++ goto free_all; ++ ++ dev->num_rx_queues = rxqs; ++ dev->real_num_rx_queues = rxqs; ++ if (netif_alloc_rx_queues(dev)) ++ goto free_all; ++ ++ strcpy(dev->name, name); ++ dev->name_assign_type = name_assign_type; ++ dev->group = INIT_NETDEV_GROUP; ++ if (!dev->ethtool_ops) ++ dev->ethtool_ops = &default_ethtool_ops; ++ ++ nf_hook_netdev_init(dev); ++ ++ return dev; ++ ++free_all: ++ free_netdev(dev); ++ return NULL; ++ ++free_pcpu: ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++free_dev: ++#endif ++ netdev_freemem(dev); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_netdev_mqs); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. If this ++ * is the last reference then it will be freed.Must be called in process ++ * context. ++ */ ++void free_netdev(struct net_device *dev) ++{ ++ struct napi_struct *p, *n; ++ ++ might_sleep(); ++ ++ /* When called immediately after register_netdevice() failed the unwind ++ * handling may still be dismantling the device. Handle that case by ++ * deferring the free. ++ */ ++ if (dev->reg_state == NETREG_UNREGISTERING) { ++ ASSERT_RTNL(); ++ dev->needs_free_netdev = true; ++ return; ++ } ++ ++ netif_free_tx_queues(dev); ++ netif_free_rx_queues(dev); ++ ++ kfree(rcu_dereference_protected(dev->ingress_queue, 1)); ++ ++ /* Flush device addresses */ ++ dev_addr_flush(dev); ++ ++ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) ++ netif_napi_del(p); ++ ++ ref_tracker_dir_exit(&dev->refcnt_tracker); ++#ifdef CONFIG_PCPU_DEV_REFCNT ++ free_percpu(dev->pcpu_refcnt); ++ dev->pcpu_refcnt = NULL; ++#endif ++ free_percpu(dev->core_stats); ++ dev->core_stats = NULL; ++ free_percpu(dev->xdp_bulkq); ++ dev->xdp_bulkq = NULL; ++ ++ /* Compatibility with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ netdev_freemem(dev); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via device release */ ++ put_device(&dev->dev); ++} ++EXPORT_SYMBOL(free_netdev); ++ ++/** ++ * synchronize_net - Synchronize with packet receive processing ++ * ++ * Wait for packets currently being received to be done. ++ * Does not block later packets from starting. ++ */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ if (rtnl_is_locked()) ++ synchronize_rcu_expedited(); ++ else ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(synchronize_net); ++ ++/** ++ * unregister_netdevice_queue - remove device from the kernel ++ * @dev: device ++ * @head: list ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * If head not NULL, device is queued to be unregistered later. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. 
++ */ ++ ++void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ++{ ++ ASSERT_RTNL(); ++ ++ if (head) { ++ list_move_tail(&dev->unreg_list, head); ++ } else { ++ LIST_HEAD(single); ++ ++ list_add(&dev->unreg_list, &single); ++ unregister_netdevice_many(&single); ++ } ++} ++EXPORT_SYMBOL(unregister_netdevice_queue); ++ ++/** ++ * unregister_netdevice_many - unregister many devices ++ * @head: list of devices ++ * ++ * Note: As most callers use a stack allocated list_head, ++ * we force a list_del() to make sure stack wont be corrupted later. ++ */ ++void unregister_netdevice_many(struct list_head *head) ++{ ++ struct net_device *dev, *tmp; ++ LIST_HEAD(close_head); ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ if (list_empty(head)) ++ return; ++ ++ list_for_each_entry_safe(dev, tmp, head, unreg_list) { ++ /* Some devices call without registering ++ * for initialization unwind. Remove those ++ * devices and proceed with the remaining. ++ */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ pr_debug("unregister_netdevice: device %s/%p never was registered\n", ++ dev->name, dev); ++ ++ WARN_ON(1); ++ list_del(&dev->unreg_list); ++ continue; ++ } ++ dev->dismantle = true; ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ } ++ ++ /* If device is running, close it first. */ ++ list_for_each_entry(dev, head, unreg_list) ++ list_add_tail(&dev->close_list, &close_head); ++ dev_close_many(&close_head, true); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ /* And unlink it from device chain. */ ++ write_lock(&dev_base_lock); ++ unlist_netdevice(dev, false); ++ dev->reg_state = NETREG_UNREGISTERING; ++ write_unlock(&dev_base_lock); ++ } ++ flush_all_backlogs(); ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ struct sk_buff *skb = NULL; ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ dev_xdp_uninstall(dev); ++ ++ netdev_offload_xstats_disable_all(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ ++ if (!dev->rtnl_link_ops || ++ dev->rtnl_link_state == RTNL_LINK_INITIALIZED) ++ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, ++ GFP_KERNEL, NULL, 0); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ netdev_name_node_alt_flush(dev); ++ netdev_name_node_free(dev->name_node); ++ ++ if (dev->netdev_ops->ndo_uninit) ++ dev->netdev_ops->ndo_uninit(dev); ++ ++ if (skb) ++ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); ++ ++ /* Notifier chain MUST detach us all upper devices. */ ++ WARN_ON(netdev_has_any_upper_dev(dev)); ++ WARN_ON(netdev_has_any_lower_dev(dev)); ++ ++ /* Remove entries from kobject tree */ ++ netdev_unregister_kobject(dev); ++#ifdef CONFIG_XPS ++ /* Remove XPS queueing entries */ ++ netif_reset_xps_queues_gt(dev, 0); ++#endif ++ } ++ ++ synchronize_net(); ++ ++ list_for_each_entry(dev, head, unreg_list) { ++ netdev_put(dev, &dev->dev_registered_tracker); ++ net_set_todo(dev); ++ } ++ ++ list_del(head); ++} ++EXPORT_SYMBOL(unregister_netdevice_many); ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. 
++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++EXPORT_SYMBOL(unregister_netdev); ++ ++/** ++ * __dev_change_net_namespace - move device to different nethost namespace ++ * @dev: device ++ * @net: network namespace ++ * @pat: If not NULL name pattern to try if the current device name ++ * is already taken in the destination network namespace. ++ * @new_ifindex: If not zero, specifies device index in the target ++ * namespace. ++ * ++ * This function shuts down a device interface and moves it ++ * to a new network namespace. On success 0 is returned, on ++ * a failure a netagive errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. ++ */ ++ ++int __dev_change_net_namespace(struct net_device *dev, struct net *net, ++ const char *pat, int new_ifindex) ++{ ++ struct net *net_old = dev_net(dev); ++ int err, new_nsid; ++ ++ ASSERT_RTNL(); ++ ++ /* Don't allow namespace local devices to be moved. */ ++ err = -EINVAL; ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ goto out; ++ ++ /* Ensure the device has been registrered */ ++ if (dev->reg_state != NETREG_REGISTERED) ++ goto out; ++ ++ /* Get out if there is nothing todo */ ++ err = 0; ++ if (net_eq(net_old, net)) ++ goto out; ++ ++ /* Pick the destination device name, and ensure ++ * we can use it in the destination network namespace. ++ */ ++ err = -EEXIST; ++ if (netdev_name_in_use(net, dev->name)) { ++ /* We get here if we can't use the current device name */ ++ if (!pat) ++ goto out; ++ err = dev_get_valid_name(net, dev, pat); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* Check that new_ifindex isn't used yet. */ ++ err = -EBUSY; ++ if (new_ifindex && __dev_get_by_index(net, new_ifindex)) ++ goto out; ++ ++ /* ++ * And now a mini version of register_netdevice unregister_netdevice. ++ */ ++ ++ /* If device is running close it first. */ ++ dev_close(dev); ++ ++ /* And unlink it from device chain */ ++ unlist_netdevice(dev, true); ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ /* Notify protocols, that we are about to destroy ++ * this device. They should clean all the things. ++ * ++ * Note that dev->reg_state stays at NETREG_REGISTERED. ++ * This is wanted because this way 8021q and macvlan know ++ * the device is just moving and can keep their slaves up. 
++ */ ++ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ rcu_barrier(); ++ ++ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); ++ /* If there is an ifindex conflict assign a new one */ ++ if (!new_ifindex) { ++ if (__dev_get_by_index(net, dev->ifindex)) ++ new_ifindex = dev_new_index(net); ++ else ++ new_ifindex = dev->ifindex; ++ } ++ ++ rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, ++ new_ifindex); ++ ++ /* ++ * Flush the unicast and multicast chains ++ */ ++ dev_uc_flush(dev); ++ dev_mc_flush(dev); ++ ++ /* Send a netdev-removed uevent to the old namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); ++ netdev_adjacent_del_links(dev); ++ ++ /* Move per-net netdevice notifiers that are following the netdevice */ ++ move_netdevice_notifiers_dev_net(dev, net); ++ ++ /* Actually switch the network namespace */ ++ dev_net_set(dev, net); ++ dev->ifindex = new_ifindex; ++ ++ /* Send a netdev-add uevent to the new namespace */ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); ++ netdev_adjacent_add_links(dev); ++ ++ /* Fixup kobjects */ ++ err = device_rename(&dev->dev, dev->name); ++ WARN_ON(err); ++ ++ /* Adapt owner in case owning user namespace of target network ++ * namespace is different from the original one. ++ */ ++ err = netdev_change_owner(dev, net_old, net); ++ WARN_ON(err); ++ ++ /* Add the device back in the hashes */ ++ list_netdevice(dev); ++ ++ /* Notify protocols, that a new device appeared. */ ++ call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ ++ /* ++ * Prevent userspace races by waiting until the network ++ * device is fully setup before sending notifications. ++ */ ++ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); ++ ++ synchronize_net(); ++ err = 0; ++out: ++ return err; ++} ++EXPORT_SYMBOL_GPL(__dev_change_net_namespace); ++ ++static int dev_cpu_dead(unsigned int oldcpu) ++{ ++ struct sk_buff **list_skb; ++ struct sk_buff *skb; ++ unsigned int cpu; ++ struct softnet_data *sd, *oldsd, *remsd = NULL; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Append output queue from offline CPU. */ ++ if (oldsd->output_queue) { ++ *sd->output_queue_tailp = oldsd->output_queue; ++ sd->output_queue_tailp = oldsd->output_queue_tailp; ++ oldsd->output_queue = NULL; ++ oldsd->output_queue_tailp = &oldsd->output_queue; ++ } ++ /* Append NAPI poll list from offline CPU, with one exception : ++ * process_backlog() must be called by cpu owning percpu backlog. ++ * We properly handle process_queue & input_pkt_queue later. 
++ */ ++ while (!list_empty(&oldsd->poll_list)) { ++ struct napi_struct *napi = list_first_entry(&oldsd->poll_list, ++ struct napi_struct, ++ poll_list); ++ ++ list_del_init(&napi->poll_list); ++ if (napi->poll == process_backlog) ++ napi->state = 0; ++ else ++ ____napi_schedule(sd, napi); ++ } ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++#ifdef CONFIG_RPS ++ remsd = oldsd->rps_ipi_list; ++ oldsd->rps_ipi_list = NULL; ++#endif ++ /* send out pending IPI's on offline CPU */ ++ net_rps_send_ipi(remsd); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->process_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ netif_rx(skb); ++ input_queue_head_incr(oldsd); ++ } ++ ++ return 0; ++} ++ ++/** ++ * netdev_increment_features - increment feature set by one ++ * @all: current feature set ++ * @one: new feature set ++ * @mask: mask feature set ++ * ++ * Computes a new feature set after adding a device with feature set ++ * @one to the master device with current feature set @all. Will not ++ * enable anything that is off in @mask. Returns the new feature set. ++ */ ++netdev_features_t netdev_increment_features(netdev_features_t all, ++ netdev_features_t one, netdev_features_t mask) ++{ ++ if (mask & NETIF_F_HW_CSUM) ++ mask |= NETIF_F_CSUM_MASK; ++ mask |= NETIF_F_VLAN_CHALLENGED; ++ ++ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; ++ all &= one | ~NETIF_F_ALL_FOR_ALL; ++ ++ /* If one device supports hw checksumming, set for all. */ ++ if (all & NETIF_F_HW_CSUM) ++ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); ++ ++ return all; ++} ++EXPORT_SYMBOL(netdev_increment_features); ++ ++static struct hlist_head * __net_init netdev_create_hash(void) ++{ ++ int i; ++ struct hlist_head *hash; ++ ++ hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); ++ if (hash != NULL) ++ for (i = 0; i < NETDEV_HASHENTRIES; i++) ++ INIT_HLIST_HEAD(&hash[i]); ++ ++ return hash; ++} ++ ++/* Initialize per network namespace state */ ++static int __net_init netdev_init(struct net *net) ++{ ++ BUILD_BUG_ON(GRO_HASH_BUCKETS > ++ 8 * sizeof_field(struct napi_struct, gro_bitmask)); ++ ++ INIT_LIST_HEAD(&net->dev_base_head); ++ ++ net->dev_name_head = netdev_create_hash(); ++ if (net->dev_name_head == NULL) ++ goto err_name; ++ ++ net->dev_index_head = netdev_create_hash(); ++ if (net->dev_index_head == NULL) ++ goto err_idx; ++ ++ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); ++ ++ return 0; ++ ++err_idx: ++ kfree(net->dev_name_head); ++err_name: ++ return -ENOMEM; ++} ++ ++/** ++ * netdev_drivername - network driver for the device ++ * @dev: network device ++ * ++ * Determine network driver for device. 
++ */ ++const char *netdev_drivername(const struct net_device *dev) ++{ ++ const struct device_driver *driver; ++ const struct device *parent; ++ const char *empty = ""; ++ ++ parent = dev->dev.parent; ++ if (!parent) ++ return empty; ++ ++ driver = parent->driver; ++ if (driver && driver->name) ++ return driver->name; ++ return empty; ++} ++ ++static void __netdev_printk(const char *level, const struct net_device *dev, ++ struct va_format *vaf) ++{ ++ if (dev && dev->dev.parent) { ++ dev_printk_emit(level[1] - '0', ++ dev->dev.parent, ++ "%s %s %s%s: %pV", ++ dev_driver_string(dev->dev.parent), ++ dev_name(dev->dev.parent), ++ netdev_name(dev), netdev_reg_state(dev), ++ vaf); ++ } else if (dev) { ++ printk("%s%s%s: %pV", ++ level, netdev_name(dev), netdev_reg_state(dev), vaf); ++ } else { ++ printk("%s(NULL net_device): %pV", level, vaf); ++ } ++} ++ ++void netdev_printk(const char *level, const struct net_device *dev, ++ const char *format, ...) ++{ ++ struct va_format vaf; ++ va_list args; ++ ++ va_start(args, format); ++ ++ vaf.fmt = format; ++ vaf.va = &args; ++ ++ __netdev_printk(level, dev, &vaf); ++ ++ va_end(args); ++} ++EXPORT_SYMBOL(netdev_printk); ++ ++#define define_netdev_printk_level(func, level) \ ++void func(const struct net_device *dev, const char *fmt, ...) \ ++{ \ ++ struct va_format vaf; \ ++ va_list args; \ ++ \ ++ va_start(args, fmt); \ ++ \ ++ vaf.fmt = fmt; \ ++ vaf.va = &args; \ ++ \ ++ __netdev_printk(level, dev, &vaf); \ ++ \ ++ va_end(args); \ ++} \ ++EXPORT_SYMBOL(func); ++ ++define_netdev_printk_level(netdev_emerg, KERN_EMERG); ++define_netdev_printk_level(netdev_alert, KERN_ALERT); ++define_netdev_printk_level(netdev_crit, KERN_CRIT); ++define_netdev_printk_level(netdev_err, KERN_ERR); ++define_netdev_printk_level(netdev_warn, KERN_WARNING); ++define_netdev_printk_level(netdev_notice, KERN_NOTICE); ++define_netdev_printk_level(netdev_info, KERN_INFO); ++ ++static void __net_exit netdev_exit(struct net *net) ++{ ++ kfree(net->dev_name_head); ++ kfree(net->dev_index_head); ++ if (net != &init_net) ++ WARN_ON_ONCE(!list_empty(&net->dev_base_head)); ++} ++ ++static struct pernet_operations __net_initdata netdev_net_ops = { ++ .init = netdev_init, ++ .exit = netdev_exit, ++}; ++ ++static void __net_exit default_device_exit_net(struct net *net) ++{ ++ struct net_device *dev, *aux; ++ /* ++ * Push all migratable network devices back to the ++ * initial network namespace ++ */ ++ ASSERT_RTNL(); ++ for_each_netdev_safe(net, dev, aux) { ++ int err; ++ char fb_name[IFNAMSIZ]; ++ ++ /* Ignore unmoveable devices (i.e. loopback) */ ++ if (dev->features & NETIF_F_NETNS_LOCAL) ++ continue; ++ ++ /* Leave virtual devices for the generic cleanup */ ++ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) ++ continue; ++ ++ /* Push remaining network devices to init_net */ ++ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); ++ if (netdev_name_in_use(&init_net, fb_name)) ++ snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ err = dev_change_net_namespace(dev, &init_net, fb_name); ++ if (err) { ++ pr_emerg("%s: failed to move %s to init_net: %d\n", ++ __func__, dev->name, err); ++ BUG(); ++ } ++ } ++} ++ ++static void __net_exit default_device_exit_batch(struct list_head *net_list) ++{ ++ /* At exit all network devices most be removed from a network ++ * namespace. Do this in the reverse order of registration. ++ * Do this across as many network namespaces as possible to ++ * improve batching efficiency. 
++ */ ++ struct net_device *dev; ++ struct net *net; ++ LIST_HEAD(dev_kill_list); ++ ++ rtnl_lock(); ++ list_for_each_entry(net, net_list, exit_list) { ++ default_device_exit_net(net); ++ cond_resched(); ++ } ++ ++ list_for_each_entry(net, net_list, exit_list) { ++ for_each_netdev_reverse(net, dev) { ++ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) ++ dev->rtnl_link_ops->dellink(dev, &dev_kill_list); ++ else ++ unregister_netdevice_queue(dev, &dev_kill_list); ++ } ++ } ++ unregister_netdevice_many(&dev_kill_list); ++ rtnl_unlock(); ++} ++ ++static struct pernet_operations __net_initdata default_device_ops = { ++ .exit_batch = default_device_exit_batch, ++}; ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. ++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_kobject_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < PTYPE_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ if (register_pernet_subsys(&netdev_net_ops)) ++ goto out; ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for_each_possible_cpu(i) { ++ struct work_struct *flush = per_cpu_ptr(&flush_works, i); ++ struct softnet_data *sd = &per_cpu(softnet_data, i); ++ ++ INIT_WORK(flush, flush_backlog); ++ ++ skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init(&sd->process_queue); ++#ifdef CONFIG_XFRM_OFFLOAD ++ skb_queue_head_init(&sd->xfrm_backlog); ++#endif ++ INIT_LIST_HEAD(&sd->poll_list); ++ sd->output_queue_tailp = &sd->output_queue; ++#ifdef CONFIG_RPS ++ INIT_CSD(&sd->csd, rps_trigger_softirq, sd); ++ sd->cpu = i; ++#endif ++ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); ++ spin_lock_init(&sd->defer_lock); ++ ++ init_gro_hash(&sd->backlog); ++ sd->backlog.poll = process_backlog; ++ sd->backlog.weight = weight_p; ++ } ++ ++ dev_boot_phase = 0; ++ ++ /* The loopback device is special if any other network devices ++ * is present in a network namespace the loopback device must ++ * be present. Since we now dynamically allocate and free the ++ * loopback device ensure this invariant is maintained by ++ * keeping the loopback device as the first device on the ++ * list of network devices. Ensuring the loopback devices ++ * is the first device that appears and the last network device ++ * that disappears. 
++ */ ++ if (register_pernet_device(&loopback_net_ops)) ++ goto out; ++ ++ if (register_pernet_device(&default_device_ops)) ++ goto out; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action); ++ ++ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", ++ NULL, dev_cpu_dead); ++ WARN_ON(rc < 0); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); +diff -rupN linux.orig/net/core/devlink.c linux/net/core/devlink.c +--- linux.orig/net/core/devlink.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/devlink.c 2022-12-04 10:40:26.732034003 -0500 +@@ -8268,10 +8268,10 @@ static void devlink_trap_stats_read(stru cpu_stats = per_cpu_ptr(trap_stats, i); do { @@ -8713,11 +50605,10 @@ index b50bcc18b8d9e..cfa6a099457ae 100644 u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); -diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c -index 75501e1bdd25b..dfcaf61d972c7 100644 ---- a/net/core/drop_monitor.c -+++ b/net/core/drop_monitor.c -@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) +diff -rupN linux.orig/net/core/drop_monitor.c linux/net/core/drop_monitor.c +--- linux.orig/net/core/drop_monitor.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/drop_monitor.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net u64 dropped; do { @@ -8729,7 +50620,7 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) +@@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct u64 dropped; do { @@ -8741,11 +50632,10 @@ index 75501e1bdd25b..dfcaf61d972c7 100644 u64_stats_add(&stats->dropped, dropped); } -diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c -index c8d137ef5980e..b71ccaec09914 100644 ---- a/net/core/gen_stats.c -+++ b/net/core/gen_stats.c -@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, +diff -rupN linux.orig/net/core/gen_stats.c linux/net/core/gen_stats.c +--- linux.orig/net/core/gen_stats.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/gen_stats.c 2022-12-04 10:40:26.732034003 -0500 +@@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(str u64 bytes, packets; do { @@ -8758,7 +50648,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, +@@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_st } do { if (running) @@ -8771,7 +50661,7 @@ index c8d137ef5980e..b71ccaec09914 100644 _bstats_update(bstats, bytes, packets); } -@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *r u64 bytes, packets; do { @@ -8784,7 +50674,7 @@ index c8d137ef5980e..b71ccaec09914 100644 t_bytes += bytes; t_packets += packets; -@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, +@@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *r } do { if (running) @@ -8797,11 +50687,10 @@ index c8d137ef5980e..b71ccaec09914 100644 } static int -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 417463da4fac7..505c72a9b1534 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -6555,6 +6555,11 @@ nodefer: __kfree_skb(skb); +diff -rupN linux.orig/net/core/skbuff.c 
linux/net/core/skbuff.c +--- linux.orig/net/core/skbuff.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/core/skbuff.c 2022-12-04 10:40:26.732034003 -0500 +@@ -6557,6 +6557,11 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ @@ -8814,11 +50703,6576 @@ index 417463da4fac7..505c72a9b1534 100644 +#endif + } } -diff --git a/net/dsa/slave.c b/net/dsa/slave.c -index 1291c2431d440..dcc550b871623 100644 ---- a/net/dsa/slave.c -+++ b/net/dsa/slave.c -@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, +diff -rupN linux.orig/net/core/skbuff.c.orig linux/net/core/skbuff.c.orig +--- linux.orig/net/core/skbuff.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/core/skbuff.c.orig 2022-12-04 10:40:18.728054516 -0500 +@@ -0,0 +1,6562 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Routines having to do with the 'struct sk_buff' memory handlers. ++ * ++ * Authors: Alan Cox ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Fixed the worst of the load ++ * balancer bugs. ++ * Dave Platt : Interrupt stacking fix. ++ * Richard Kooijman : Timestamp fixes. ++ * Alan Cox : Changed buffer format. ++ * Alan Cox : destructor hook for AF_UNIX etc. ++ * Linus Torvalds : Better skb_clone. ++ * Alan Cox : Added skb_copy. ++ * Alan Cox : Added all the changed routines Linus ++ * only put in the headers ++ * Ray VanTassle : Fixed --skb->lock in free ++ * Alan Cox : skb_copy copy arp field ++ * Andi Kleen : slabified it. ++ * Robert Olsson : Removed skb_head_pool ++ * ++ * NOTE: ++ * The __skb_ routines should be called with interrupts ++ * disabled, or you better be *real* sure that the operation is atomic ++ * with respect to whatever list is being frobbed (e.g. via lock_sock() ++ * or via disabling bottom half handlers, etc). ++ */ ++ ++/* ++ * The functions in this file will not compile correctly with gcc 2.4.x ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_CLS_ACT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dev.h" ++#include "sock_destructor.h" ++ ++struct kmem_cache *skbuff_head_cache __ro_after_init; ++static struct kmem_cache *skbuff_fclone_cache __ro_after_init; ++#ifdef CONFIG_SKB_EXTENSIONS ++static struct kmem_cache *skbuff_ext_cache __ro_after_init; ++#endif ++int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; ++EXPORT_SYMBOL(sysctl_max_skb_frags); ++ ++#undef FN ++#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, ++const char * const drop_reasons[] = { ++ DEFINE_DROP_REASON(FN, FN) ++}; ++EXPORT_SYMBOL(drop_reasons); ++ ++/** ++ * skb_panic - private function for out-of-line support ++ * @skb: buffer ++ * @sz: size ++ * @addr: address ++ * @msg: skb_over_panic or skb_under_panic ++ * ++ * Out-of-line support for skb_put() and skb_push(). ++ * Called via the wrapper skb_over_panic() or skb_under_panic(). ++ * Keep out of line to prevent kernel bloat. ++ * __builtin_return_address is not used because it is not always reliable. 
++ */ ++static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, ++ const char msg[]) ++{ ++ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", ++ msg, addr, skb->len, sz, skb->head, skb->data, ++ (unsigned long)skb->tail, (unsigned long)skb->end, ++ skb->dev ? skb->dev->name : ""); ++ BUG(); ++} ++ ++static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++#define NAPI_SKB_CACHE_SIZE 64 ++#define NAPI_SKB_CACHE_BULK 16 ++#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) ++ ++struct napi_alloc_cache { ++ struct page_frag_cache page; ++ unsigned int skb_count; ++ void *skb_cache[NAPI_SKB_CACHE_SIZE]; ++}; ++ ++static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); ++static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); ++ ++void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ ++ return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++} ++EXPORT_SYMBOL(__napi_alloc_frag_align); ++ ++void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) ++{ ++ void *data; ++ ++ fragsz = SKB_DATA_ALIGN(fragsz); ++ if (in_hardirq() || irqs_disabled()) { ++ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); ++ ++ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); ++ } else { ++ struct napi_alloc_cache *nc; ++ ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); ++ local_bh_enable(); ++ } ++ return data; ++} ++EXPORT_SYMBOL(__netdev_alloc_frag_align); ++ ++static struct sk_buff *napi_skb_cache_get(void) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ struct sk_buff *skb; ++ ++ if (unlikely(!nc->skb_count)) { ++ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, ++ GFP_ATOMIC, ++ NAPI_SKB_CACHE_BULK, ++ nc->skb_cache); ++ if (unlikely(!nc->skb_count)) ++ return NULL; ++ } ++ ++ skb = nc->skb_cache[--nc->skb_count]; ++ kasan_unpoison_object_data(skbuff_head_cache, skb); ++ ++ return skb; ++} ++ ++/* Caller must provide SKB that is memset cleared */ ++static void __build_skb_around(struct sk_buff *skb, void *data, ++ unsigned int frag_size) ++{ ++ struct skb_shared_info *shinfo; ++ unsigned int size = frag_size ? : ksize(data); ++ ++ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ /* Assumes caller memset cleared SKB */ ++ skb->truesize = SKB_TRUESIZE(size); ++ refcount_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb_set_end_offset(skb, size); ++ skb->mac_header = (typeof(skb->mac_header))~0U; ++ skb->transport_header = (typeof(skb->transport_header))~0U; ++ skb->alloc_cpu = raw_smp_processor_id(); ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ ++ skb_set_kcov_handle(skb, kcov_common_handle()); ++} ++ ++/** ++ * __build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Allocate a new &sk_buff. 
Caller provides space holding head and ++ * skb_shared_info. @data must have been allocated by kmalloc() only if ++ * @frag_size is 0, otherwise data should come from the page allocator ++ * or vmalloc() ++ * The return is the new skb buffer. ++ * On a failure the return is %NULL, and @data is not freed. ++ * Notes : ++ * Before IO, driver allocates only data buffer where NIC put incoming frame ++ * Driver should add room at head (NET_SKB_PAD) and ++ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) ++ * After IO, driver calls build_skb(), to allocate sk_buff and populate it ++ * before giving packet to stack. ++ * RX rings only contains data buffers, not full skbs. ++ */ ++struct sk_buff *__build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/* build_skb() is wrapper over __build_skb(), that specifically ++ * takes care of skb->head and skb->pfmemalloc ++ * This means that if @frag_size is not zero, then @data must be backed ++ * by a page fragment, not kmalloc() or vmalloc() ++ */ ++struct sk_buff *build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __build_skb(data, frag_size); ++ ++ if (skb && frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb); ++ ++/** ++ * build_skb_around - build a network buffer around provided skb ++ * @skb: sk_buff provide by caller, must be memset cleared ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ */ ++struct sk_buff *build_skb_around(struct sk_buff *skb, ++ void *data, unsigned int frag_size) ++{ ++ if (unlikely(!skb)) ++ return NULL; ++ ++ __build_skb_around(skb, data, frag_size); ++ ++ if (frag_size) { ++ skb->head_frag = 1; ++ if (page_is_pfmemalloc(virt_to_head_page(data))) ++ skb->pfmemalloc = 1; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(build_skb_around); ++ ++/** ++ * __napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __build_skb() that uses NAPI percpu caches to obtain ++ * skbuff_head instead of inplace allocation. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. ++ */ ++static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb; ++ ++ skb = napi_skb_cache_get(); ++ if (unlikely(!skb)) ++ return NULL; ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, frag_size); ++ ++ return skb; ++} ++ ++/** ++ * napi_build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of data, or 0 if head was kmalloced ++ * ++ * Version of __napi_build_skb() that takes care of skb->head_frag ++ * and skb->pfmemalloc when the data is a page or page fragment. ++ * ++ * Returns a new &sk_buff on success, %NULL on allocation failure. 
++ */ ++struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) ++{ ++ struct sk_buff *skb = __napi_build_skb(data, frag_size); ++ ++ if (likely(skb) && frag_size) { ++ skb->head_frag = 1; ++ skb_propagate_pfmemalloc(virt_to_head_page(data), skb); ++ } ++ ++ return skb; ++} ++EXPORT_SYMBOL(napi_build_skb); ++ ++/* ++ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells ++ * the caller if emergency pfmemalloc reserves are being used. If it is and ++ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves ++ * may be used. Otherwise, the packet data may be discarded until enough ++ * memory is free ++ */ ++static void *kmalloc_reserve(size_t size, gfp_t flags, int node, ++ bool *pfmemalloc) ++{ ++ void *obj; ++ bool ret_pfmemalloc = false; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ obj = kmalloc_node_track_caller(size, ++ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, ++ node); ++ if (obj || !(gfp_pfmemalloc_allowed(flags))) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ obj = kmalloc_node_track_caller(size, flags, node); ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++ ++ return obj; ++} ++ ++/* Allocate a new skbuff. We do this ourselves so we can fill in a few ++ * 'private' fields and also do memory statistics to find all the ++ * [BEEP] leaks. ++ * ++ */ ++ ++/** ++ * __alloc_skb - allocate a network buffer ++ * @size: size to allocate ++ * @gfp_mask: allocation mask ++ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache ++ * instead of head cache and allocate a cloned (child) skb. ++ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for ++ * allocations in case the data is required for writeback ++ * @node: numa node to allocate memory on ++ * ++ * Allocate a new &sk_buff. The returned buffer has no headroom and a ++ * tail room of at least size bytes. The object has a reference count ++ * of one. The return is the buffer. On a failure the return is %NULL. ++ * ++ * Buffers may only be allocated from interrupts using a @gfp_mask of ++ * %GFP_ATOMIC. ++ */ ++struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, ++ int flags, int node) ++{ ++ struct kmem_cache *cache; ++ struct sk_buff *skb; ++ unsigned int osize; ++ bool pfmemalloc; ++ u8 *data; ++ ++ cache = (flags & SKB_ALLOC_FCLONE) ++ ? skbuff_fclone_cache : skbuff_head_cache; ++ ++ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ /* Get the HEAD */ ++ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && ++ likely(node == NUMA_NO_NODE || node == numa_mem_id())) ++ skb = napi_skb_cache_get(); ++ else ++ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); ++ if (unlikely(!skb)) ++ return NULL; ++ prefetchw(skb); ++ ++ /* We do our best to align skb_shared_info on a separate cache ++ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives ++ * aligned memory blocks, unless SLUB/SLAB debug is enabled. ++ * Both skb->head and skb_shared_info are cache line aligned. ++ */ ++ size = SKB_DATA_ALIGN(size); ++ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ++ if (unlikely(!data)) ++ goto nodata; ++ /* kmalloc(size) might give us more room than requested. ++ * Put skb_shared_info exactly at the end of allocated zone, ++ * to allow max possible filling before reallocation. 
++ */ ++ osize = ksize(data); ++ size = SKB_WITH_OVERHEAD(osize); ++ prefetchw(data + size); ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! ++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ __build_skb_around(skb, data, osize); ++ skb->pfmemalloc = pfmemalloc; ++ ++ if (flags & SKB_ALLOC_FCLONE) { ++ struct sk_buff_fclones *fclones; ++ ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ skb->fclone = SKB_FCLONE_ORIG; ++ refcount_set(&fclones->fclone_ref, 1); ++ } ++ ++ return skb; ++ ++nodata: ++ kmem_cache_free(cache, skb); ++ return NULL; ++} ++EXPORT_SYMBOL(__alloc_skb); ++ ++/** ++ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device ++ * @dev: network device to receive on ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb ++ * ++ * Allocate a new &sk_buff and assign it a usage count of one. The ++ * buffer has NET_SKB_PAD headroom built in. Users should allocate ++ * the headroom they think they need without accounting for the ++ * built in space. The built in space is used for optimisations. ++ * ++ * %NULL is returned if there is no free memory. ++ */ ++struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct page_frag_cache *nc; ++ struct sk_buff *skb; ++ bool pfmemalloc; ++ void *data; ++ ++ len += NET_SKB_PAD; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ if (in_hardirq() || irqs_disabled()) { ++ nc = this_cpu_ptr(&netdev_alloc_cache); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ } else { ++ local_bh_disable(); ++ nc = this_cpu_ptr(&napi_alloc_cache.page); ++ data = page_frag_alloc(nc, len, gfp_mask); ++ pfmemalloc = nc->pfmemalloc; ++ local_bh_enable(); ++ } ++ ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD); ++ skb->dev = dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__netdev_alloc_skb); ++ ++/** ++ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance ++ * @napi: napi instance this buffer was allocated for ++ * @len: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages ++ * ++ * Allocate a new sk_buff for use in NAPI receive. This buffer will ++ * attempt to allocate the head from a special reserved region used ++ * only for NAPI Rx allocation. By doing this we can save several ++ * CPU cycles by avoiding having to disable and re-enable IRQs. ++ * ++ * %NULL is returned if there is no free memory. 
++ */ ++struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, ++ gfp_t gfp_mask) ++{ ++ struct napi_alloc_cache *nc; ++ struct sk_buff *skb; ++ void *data; ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ len += NET_SKB_PAD + NET_IP_ALIGN; ++ ++ /* If requested length is either too small or too big, ++ * we use kmalloc() for skb->head allocation. ++ */ ++ if (len <= SKB_WITH_OVERHEAD(1024) || ++ len > SKB_WITH_OVERHEAD(PAGE_SIZE) || ++ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { ++ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, ++ NUMA_NO_NODE); ++ if (!skb) ++ goto skb_fail; ++ goto skb_success; ++ } ++ ++ nc = this_cpu_ptr(&napi_alloc_cache); ++ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ len = SKB_DATA_ALIGN(len); ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ data = page_frag_alloc(&nc->page, len, gfp_mask); ++ if (unlikely(!data)) ++ return NULL; ++ ++ skb = __napi_build_skb(data, len); ++ if (unlikely(!skb)) { ++ skb_free_frag(data); ++ return NULL; ++ } ++ ++ if (nc->page.pfmemalloc) ++ skb->pfmemalloc = 1; ++ skb->head_frag = 1; ++ ++skb_success: ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ skb->dev = napi->dev; ++ ++skb_fail: ++ return skb; ++} ++EXPORT_SYMBOL(__napi_alloc_skb); ++ ++void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, ++ int size, unsigned int truesize) ++{ ++ skb_fill_page_desc(skb, i, page, off, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_add_rx_frag); ++ ++void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, ++ unsigned int truesize) ++{ ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ skb_frag_size_add(frag, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_coalesce_rx_frag); ++ ++static void skb_drop_list(struct sk_buff **listp) ++{ ++ kfree_skb_list(*listp); ++ *listp = NULL; ++} ++ ++static inline void skb_drop_fraglist(struct sk_buff *skb) ++{ ++ skb_drop_list(&skb_shinfo(skb)->frag_list); ++} ++ ++static void skb_clone_fraglist(struct sk_buff *skb) ++{ ++ struct sk_buff *list; ++ ++ skb_walk_frags(skb, list) ++ skb_get(list); ++} ++ ++static void skb_free_head(struct sk_buff *skb) ++{ ++ unsigned char *head = skb->head; ++ ++ if (skb->head_frag) { ++ if (skb_pp_recycle(skb, head)) ++ return; ++ skb_free_frag(head); ++ } else { ++ kfree(head); ++ } ++} ++ ++static void skb_release_data(struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int i; ++ ++ if (skb->cloned && ++ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, ++ &shinfo->dataref)) ++ goto exit; ++ ++ if (skb_zcopy(skb)) { ++ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; ++ ++ skb_zcopy_clear(skb, true); ++ if (skip_unref) ++ goto free_head; ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) ++ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); ++ ++free_head: ++ if (shinfo->frag_list) ++ kfree_skb_list(shinfo->frag_list); ++ ++ skb_free_head(skb); ++exit: ++ /* When we clone an SKB we copy the reycling bit. The pp_recycle ++ * bit is only set on the head though, so in order to avoid races ++ * while trying to recycle fragments on __skb_frag_unref() we need ++ * to make one SKB responsible for triggering the recycle path. ++ * So disable the recycling bit if an SKB is cloned and we have ++ * additional references to the fragmented part of the SKB. 
++ * Eventually the last SKB will have the recycling bit set and it's ++ * dataref set to 0, which will trigger the recycling ++ */ ++ skb->pp_recycle = 0; ++} ++ ++/* ++ * Free an skbuff by memory without cleaning the state. ++ */ ++static void kfree_skbmem(struct sk_buff *skb) ++{ ++ struct sk_buff_fclones *fclones; ++ ++ switch (skb->fclone) { ++ case SKB_FCLONE_UNAVAILABLE: ++ kmem_cache_free(skbuff_head_cache, skb); ++ return; ++ ++ case SKB_FCLONE_ORIG: ++ fclones = container_of(skb, struct sk_buff_fclones, skb1); ++ ++ /* We usually free the clone (TX completion) before original skb ++ * This test would have no chance to be true for the clone, ++ * while here, branch prediction will be good. ++ */ ++ if (refcount_read(&fclones->fclone_ref) == 1) ++ goto fastpath; ++ break; ++ ++ default: /* SKB_FCLONE_CLONE */ ++ fclones = container_of(skb, struct sk_buff_fclones, skb2); ++ break; ++ } ++ if (!refcount_dec_and_test(&fclones->fclone_ref)) ++ return; ++fastpath: ++ kmem_cache_free(skbuff_fclone_cache, fclones); ++} ++ ++void skb_release_head_state(struct sk_buff *skb) ++{ ++ skb_dst_drop(skb); ++ if (skb->destructor) { ++ DEBUG_NET_WARN_ON_ONCE(in_hardirq()); ++ skb->destructor(skb); ++ } ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++ nf_conntrack_put(skb_nfct(skb)); ++#endif ++ skb_ext_put(skb); ++} ++ ++/* Free everything but the sk_buff shell. */ ++static void skb_release_all(struct sk_buff *skb) ++{ ++ skb_release_head_state(skb); ++ if (likely(skb->head)) ++ skb_release_data(skb); ++} ++ ++/** ++ * __kfree_skb - private function ++ * @skb: buffer ++ * ++ * Free an sk_buff. Release anything attached to the buffer. ++ * Clean the state. This is an internal helper function. Users should ++ * always call kfree_skb ++ */ ++ ++void __kfree_skb(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ kfree_skbmem(skb); ++} ++EXPORT_SYMBOL(__kfree_skb); ++ ++/** ++ * kfree_skb_reason - free an sk_buff with special reason ++ * @skb: buffer to free ++ * @reason: reason why this skb is dropped ++ * ++ * Drop a reference to the buffer and free it if the usage count has ++ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' ++ * tracepoint. ++ */ ++void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); ++ ++ trace_kfree_skb(skb, __builtin_return_address(0), reason); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(kfree_skb_reason); ++ ++void kfree_skb_list_reason(struct sk_buff *segs, ++ enum skb_drop_reason reason) ++{ ++ while (segs) { ++ struct sk_buff *next = segs->next; ++ ++ kfree_skb_reason(segs, reason); ++ segs = next; ++ } ++} ++EXPORT_SYMBOL(kfree_skb_list_reason); ++ ++/* Dump skb information and contents. ++ * ++ * Must only be called from net_ratelimit()-ed paths. ++ * ++ * Dumps whole packets if full_pkt, only headers otherwise. 
++ */ ++void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) ++{ ++ struct skb_shared_info *sh = skb_shinfo(skb); ++ struct net_device *dev = skb->dev; ++ struct sock *sk = skb->sk; ++ struct sk_buff *list_skb; ++ bool has_mac, has_trans; ++ int headroom, tailroom; ++ int i, len, seg_len; ++ ++ if (full_pkt) ++ len = skb->len; ++ else ++ len = min_t(int, skb->len, MAX_HEADER + 128); ++ ++ headroom = skb_headroom(skb); ++ tailroom = skb_tailroom(skb); ++ ++ has_mac = skb_mac_header_was_set(skb); ++ has_trans = skb_transport_header_was_set(skb); ++ ++ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" ++ "mac=(%d,%d) net=(%d,%d) trans=%d\n" ++ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" ++ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" ++ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", ++ level, skb->len, headroom, skb_headlen(skb), tailroom, ++ has_mac ? skb->mac_header : -1, ++ has_mac ? skb_mac_header_len(skb) : -1, ++ skb->network_header, ++ has_trans ? skb_network_header_len(skb) : -1, ++ has_trans ? skb->transport_header : -1, ++ sh->tx_flags, sh->nr_frags, ++ sh->gso_size, sh->gso_type, sh->gso_segs, ++ skb->csum, skb->ip_summed, skb->csum_complete_sw, ++ skb->csum_valid, skb->csum_level, ++ skb->hash, skb->sw_hash, skb->l4_hash, ++ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); ++ ++ if (dev) ++ printk("%sdev name=%s feat=%pNF\n", ++ level, dev->name, &dev->features); ++ if (sk) ++ printk("%ssk family=%hu type=%u proto=%u\n", ++ level, sk->sk_family, sk->sk_type, sk->sk_protocol); ++ ++ if (full_pkt && headroom) ++ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->head, headroom, false); ++ ++ seg_len = min_t(int, skb_headlen(skb), len); ++ if (seg_len) ++ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb->data, seg_len, false); ++ len -= seg_len; ++ ++ if (full_pkt && tailroom) ++ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, ++ 16, 1, skb_tail_pointer(skb), tailroom, false); ++ ++ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(frag, skb_frag_off(frag), ++ skb_frag_size(frag), p, p_off, p_len, ++ copied) { ++ seg_len = min_t(int, p_len, len); ++ vaddr = kmap_atomic(p); ++ print_hex_dump(level, "skb frag: ", ++ DUMP_PREFIX_OFFSET, ++ 16, 1, vaddr + p_off, seg_len, false); ++ kunmap_atomic(vaddr); ++ len -= seg_len; ++ if (!len) ++ break; ++ } ++ } ++ ++ if (full_pkt && skb_has_frag_list(skb)) { ++ printk("skb fraglist:\n"); ++ skb_walk_frags(skb, list_skb) ++ skb_dump(level, list_skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_dump); ++ ++/** ++ * skb_tx_error - report an sk_buff xmit error ++ * @skb: buffer that triggered an error ++ * ++ * Report xmit error if a device callback is tracking this skb. ++ * skb must be freed afterwards. 
++ */ ++void skb_tx_error(struct sk_buff *skb) ++{ ++ if (skb) { ++ skb_zcopy_downgrade_managed(skb); ++ skb_zcopy_clear(skb, true); ++ } ++} ++EXPORT_SYMBOL(skb_tx_error); ++ ++#ifdef CONFIG_TRACEPOINTS ++/** ++ * consume_skb - free an skbuff ++ * @skb: buffer to free ++ * ++ * Drop a ref to the buffer and free it if the usage count has hit zero ++ * Functions identically to kfree_skb, but kfree_skb assumes that the frame ++ * is being dropped after a failure and notes that ++ */ ++void consume_skb(struct sk_buff *skb) ++{ ++ if (!skb_unref(skb)) ++ return; ++ ++ trace_consume_skb(skb); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(consume_skb); ++#endif ++ ++/** ++ * __consume_stateless_skb - free an skbuff, assuming it is stateless ++ * @skb: buffer to free ++ * ++ * Alike consume_skb(), but this variant assumes that this is the last ++ * skb reference and all the head states have been already dropped ++ */ ++void __consume_stateless_skb(struct sk_buff *skb) ++{ ++ trace_consume_skb(skb); ++ skb_release_data(skb); ++ kfree_skbmem(skb); ++} ++ ++static void napi_skb_cache_put(struct sk_buff *skb) ++{ ++ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); ++ u32 i; ++ ++ kasan_poison_object_data(skbuff_head_cache, skb); ++ nc->skb_cache[nc->skb_count++] = skb; ++ ++ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { ++ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) ++ kasan_unpoison_object_data(skbuff_head_cache, ++ nc->skb_cache[i]); ++ ++ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, ++ nc->skb_cache + NAPI_SKB_CACHE_HALF); ++ nc->skb_count = NAPI_SKB_CACHE_HALF; ++ } ++} ++ ++void __kfree_skb_defer(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++ ++void napi_skb_free_stolen_head(struct sk_buff *skb) ++{ ++ if (unlikely(skb->slow_gro)) { ++ nf_reset_ct(skb); ++ skb_dst_drop(skb); ++ skb_ext_put(skb); ++ skb_orphan(skb); ++ skb->slow_gro = 0; ++ } ++ napi_skb_cache_put(skb); ++} ++ ++void napi_consume_skb(struct sk_buff *skb, int budget) ++{ ++ /* Zero budget indicate non-NAPI context called us, like netpoll */ ++ if (unlikely(!budget)) { ++ dev_consume_skb_any(skb); ++ return; ++ } ++ ++ DEBUG_NET_WARN_ON_ONCE(!in_softirq()); ++ ++ if (!skb_unref(skb)) ++ return; ++ ++ /* if reaching here SKB is ready to free */ ++ trace_consume_skb(skb); ++ ++ /* if SKB is a clone, don't handle this case */ ++ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ skb_release_all(skb); ++ napi_skb_cache_put(skb); ++} ++EXPORT_SYMBOL(napi_consume_skb); ++ ++/* Make sure a field is contained by headers group */ ++#define CHECK_SKB_FIELD(field) \ ++ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ ++ offsetof(struct sk_buff, headers.field)); \ ++ ++static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ new->tstamp = old->tstamp; ++ /* We do not copy old->sk */ ++ new->dev = old->dev; ++ memcpy(new->cb, old->cb, sizeof(old->cb)); ++ skb_dst_copy(new, old); ++ __skb_ext_copy(new, old); ++ __nf_copy(new, old, false); ++ ++ /* Note : this field could be in the headers group. 
++ * It is not yet because we do not want to have a 16 bit hole ++ */ ++ new->queue_mapping = old->queue_mapping; ++ ++ memcpy(&new->headers, &old->headers, sizeof(new->headers)); ++ CHECK_SKB_FIELD(protocol); ++ CHECK_SKB_FIELD(csum); ++ CHECK_SKB_FIELD(hash); ++ CHECK_SKB_FIELD(priority); ++ CHECK_SKB_FIELD(skb_iif); ++ CHECK_SKB_FIELD(vlan_proto); ++ CHECK_SKB_FIELD(vlan_tci); ++ CHECK_SKB_FIELD(transport_header); ++ CHECK_SKB_FIELD(network_header); ++ CHECK_SKB_FIELD(mac_header); ++ CHECK_SKB_FIELD(inner_protocol); ++ CHECK_SKB_FIELD(inner_transport_header); ++ CHECK_SKB_FIELD(inner_network_header); ++ CHECK_SKB_FIELD(inner_mac_header); ++ CHECK_SKB_FIELD(mark); ++#ifdef CONFIG_NETWORK_SECMARK ++ CHECK_SKB_FIELD(secmark); ++#endif ++#ifdef CONFIG_NET_RX_BUSY_POLL ++ CHECK_SKB_FIELD(napi_id); ++#endif ++ CHECK_SKB_FIELD(alloc_cpu); ++#ifdef CONFIG_XPS ++ CHECK_SKB_FIELD(sender_cpu); ++#endif ++#ifdef CONFIG_NET_SCHED ++ CHECK_SKB_FIELD(tc_index); ++#endif ++ ++} ++ ++/* ++ * You should not add any new code to this function. Add it to ++ * __copy_skb_header above instead. ++ */ ++static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) ++{ ++#define C(x) n->x = skb->x ++ ++ n->next = n->prev = NULL; ++ n->sk = NULL; ++ __copy_skb_header(n, skb); ++ ++ C(len); ++ C(data_len); ++ C(mac_len); ++ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; ++ n->cloned = 1; ++ n->nohdr = 0; ++ n->peeked = 0; ++ C(pfmemalloc); ++ C(pp_recycle); ++ n->destructor = NULL; ++ C(tail); ++ C(end); ++ C(head); ++ C(head_frag); ++ C(data); ++ C(truesize); ++ refcount_set(&n->users, 1); ++ ++ atomic_inc(&(skb_shinfo(skb)->dataref)); ++ skb->cloned = 1; ++ ++ return n; ++#undef C ++} ++ ++/** ++ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg ++ * @first: first sk_buff of the msg ++ */ ++struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) ++{ ++ struct sk_buff *n; ++ ++ n = alloc_skb(0, GFP_ATOMIC); ++ if (!n) ++ return NULL; ++ ++ n->len = first->len; ++ n->data_len = first->len; ++ n->truesize = first->truesize; ++ ++ skb_shinfo(n)->frag_list = first; ++ ++ __copy_skb_header(n, first); ++ n->destructor = NULL; ++ ++ return n; ++} ++EXPORT_SYMBOL_GPL(alloc_skb_for_msg); ++ ++/** ++ * skb_morph - morph one skb into another ++ * @dst: the skb to receive the contents ++ * @src: the skb to supply the contents ++ * ++ * This is identical to skb_clone except that the target skb is ++ * supplied by the user. ++ * ++ * The target skb is returned upon exit. ++ */ ++struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) ++{ ++ skb_release_all(dst); ++ return __skb_clone(dst, src); ++} ++EXPORT_SYMBOL_GPL(skb_morph); ++ ++int mm_account_pinned_pages(struct mmpin *mmp, size_t size) ++{ ++ unsigned long max_pg, num_pg, new_pg, old_pg; ++ struct user_struct *user; ++ ++ if (capable(CAP_IPC_LOCK) || !size) ++ return 0; ++ ++ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ ++ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ++ user = mmp->user ? 
: current_user(); ++ ++ do { ++ old_pg = atomic_long_read(&user->locked_vm); ++ new_pg = old_pg + num_pg; ++ if (new_pg > max_pg) ++ return -ENOBUFS; ++ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != ++ old_pg); ++ ++ if (!mmp->user) { ++ mmp->user = get_uid(user); ++ mmp->num_pg = num_pg; ++ } else { ++ mmp->num_pg += num_pg; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(mm_account_pinned_pages); ++ ++void mm_unaccount_pinned_pages(struct mmpin *mmp) ++{ ++ if (mmp->user) { ++ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); ++ free_uid(mmp->user); ++ } ++} ++EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); ++ ++static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) ++{ ++ struct ubuf_info *uarg; ++ struct sk_buff *skb; ++ ++ WARN_ON_ONCE(!in_task()); ++ ++ skb = sock_omalloc(sk, 0, GFP_KERNEL); ++ if (!skb) ++ return NULL; ++ ++ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); ++ uarg = (void *)skb->cb; ++ uarg->mmp.user = NULL; ++ ++ if (mm_account_pinned_pages(&uarg->mmp, size)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ uarg->callback = msg_zerocopy_callback; ++ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; ++ uarg->len = 1; ++ uarg->bytelen = size; ++ uarg->zerocopy = 1; ++ uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; ++ refcount_set(&uarg->refcnt, 1); ++ sock_hold(sk); ++ ++ return uarg; ++} ++ ++static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) ++{ ++ return container_of((void *)uarg, struct sk_buff, cb); ++} ++ ++struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, ++ struct ubuf_info *uarg) ++{ ++ if (uarg) { ++ const u32 byte_limit = 1 << 19; /* limit to a few TSO */ ++ u32 bytelen, next; ++ ++ /* there might be non MSG_ZEROCOPY users */ ++ if (uarg->callback != msg_zerocopy_callback) ++ return NULL; ++ ++ /* realloc only when socket is locked (TCP, UDP cork), ++ * so uarg->len and sk_zckey access is serialized ++ */ ++ if (!sock_owned_by_user(sk)) { ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ bytelen = uarg->bytelen + size; ++ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { ++ /* TCP can create new skb to attach new uarg */ ++ if (sk->sk_type == SOCK_STREAM) ++ goto new_alloc; ++ return NULL; ++ } ++ ++ next = (u32)atomic_read(&sk->sk_zckey); ++ if ((u32)(uarg->id + uarg->len) == next) { ++ if (mm_account_pinned_pages(&uarg->mmp, size)) ++ return NULL; ++ uarg->len++; ++ uarg->bytelen = bytelen; ++ atomic_set(&sk->sk_zckey, ++next); ++ ++ /* no extra ref when appending to datagram (MSG_MORE) */ ++ if (sk->sk_type == SOCK_STREAM) ++ net_zcopy_get(uarg); ++ ++ return uarg; ++ } ++ } ++ ++new_alloc: ++ return msg_zerocopy_alloc(sk, size); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); ++ ++static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) ++{ ++ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ u32 old_lo, old_hi; ++ u64 sum_len; ++ ++ old_lo = serr->ee.ee_info; ++ old_hi = serr->ee.ee_data; ++ sum_len = old_hi - old_lo + 1ULL + len; ++ ++ if (sum_len >= (1ULL << 32)) ++ return false; ++ ++ if (lo != old_hi + 1) ++ return false; ++ ++ serr->ee.ee_data += len; ++ return true; ++} ++ ++static void __msg_zerocopy_callback(struct ubuf_info *uarg) ++{ ++ struct sk_buff *tail, *skb = skb_from_uarg(uarg); ++ struct sock_exterr_skb *serr; ++ struct sock *sk = skb->sk; ++ struct sk_buff_head *q; ++ unsigned long flags; ++ bool is_zerocopy; ++ u32 lo, hi; ++ u16 len; ++ ++ mm_unaccount_pinned_pages(&uarg->mmp); ++ ++ /* if !len, there was only 1 call, and 
it was aborted ++ * so do not queue a completion notification ++ */ ++ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) ++ goto release; ++ ++ len = uarg->len; ++ lo = uarg->id; ++ hi = uarg->id + len - 1; ++ is_zerocopy = uarg->zerocopy; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = 0; ++ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; ++ serr->ee.ee_data = hi; ++ serr->ee.ee_info = lo; ++ if (!is_zerocopy) ++ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; ++ ++ q = &sk->sk_error_queue; ++ spin_lock_irqsave(&q->lock, flags); ++ tail = skb_peek_tail(q); ++ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || ++ !skb_zerocopy_notify_extend(tail, lo, len)) { ++ __skb_queue_tail(q, skb); ++ skb = NULL; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ sk_error_report(sk); ++ ++release: ++ consume_skb(skb); ++ sock_put(sk); ++} ++ ++void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, ++ bool success) ++{ ++ uarg->zerocopy = uarg->zerocopy & success; ++ ++ if (refcount_dec_and_test(&uarg->refcnt)) ++ __msg_zerocopy_callback(uarg); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_callback); ++ ++void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) ++{ ++ struct sock *sk = skb_from_uarg(uarg)->sk; ++ ++ atomic_dec(&sk->sk_zckey); ++ uarg->len--; ++ ++ if (have_uref) ++ msg_zerocopy_callback(NULL, uarg, true); ++} ++EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); ++ ++int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ++ struct msghdr *msg, int len, ++ struct ubuf_info *uarg) ++{ ++ struct ubuf_info *orig_uarg = skb_zcopy(skb); ++ int err, orig_len = skb->len; ++ ++ /* An skb can only point to one uarg. This edge case happens when ++ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. ++ */ ++ if (orig_uarg && uarg != orig_uarg) ++ return -EEXIST; ++ ++ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); ++ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ++ struct sock *save_sk = skb->sk; ++ ++ /* Streams do not free skb on error. Reset to prev state. */ ++ iov_iter_revert(&msg->msg_iter, skb->len - orig_len); ++ skb->sk = sk; ++ ___pskb_trim(skb, orig_len); ++ skb->sk = save_sk; ++ return err; ++ } ++ ++ skb_zcopy_set(skb, uarg, NULL); ++ return skb->len - orig_len; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); ++ ++void __skb_zcopy_downgrade_managed(struct sk_buff *skb) ++{ ++ int i; ++ ++ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++} ++EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); ++ ++static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, ++ gfp_t gfp_mask) ++{ ++ if (skb_zcopy(orig)) { ++ if (skb_zcopy(nskb)) { ++ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ ++ if (!gfp_mask) { ++ WARN_ON_ONCE(1); ++ return -ENOMEM; ++ } ++ if (skb_uarg(nskb) == skb_uarg(orig)) ++ return 0; ++ if (skb_copy_ubufs(nskb, GFP_ATOMIC)) ++ return -EIO; ++ } ++ skb_zcopy_set(nskb, skb_uarg(orig), NULL); ++ } ++ return 0; ++} ++ ++/** ++ * skb_copy_ubufs - copy userspace skb frags buffers to kernel ++ * @skb: the skb to modify ++ * @gfp_mask: allocation priority ++ * ++ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. ++ * It will copy all frags into kernel and drop the reference ++ * to userspace pages. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ * ++ * Returns 0 on success or a negative error code on failure ++ * to allocate kernel memory to copy to. ++ */ ++int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int num_frags = skb_shinfo(skb)->nr_frags; ++ struct page *page, *head = NULL; ++ int i, new_frags; ++ u32 d_off; ++ ++ if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) ++ return -EINVAL; ++ ++ if (!num_frags) ++ goto release; ++ ++ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ for (i = 0; i < new_frags; i++) { ++ page = alloc_page(gfp_mask); ++ if (!page) { ++ while (head) { ++ struct page *next = (struct page *)page_private(head); ++ put_page(head); ++ head = next; ++ } ++ return -ENOMEM; ++ } ++ set_page_private(page, (unsigned long)head); ++ head = page; ++ } ++ ++ page = head; ++ d_off = 0; ++ for (i = 0; i < num_frags; i++) { ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), ++ p, p_off, p_len, copied) { ++ u32 copy, done = 0; ++ vaddr = kmap_atomic(p); ++ ++ while (done < p_len) { ++ if (d_off == PAGE_SIZE) { ++ d_off = 0; ++ page = (struct page *)page_private(page); ++ } ++ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); ++ memcpy(page_address(page) + d_off, ++ vaddr + p_off + done, copy); ++ done += copy; ++ d_off += copy; ++ } ++ kunmap_atomic(vaddr); ++ } ++ } ++ ++ /* skb frags release userspace buffers */ ++ for (i = 0; i < num_frags; i++) ++ skb_frag_unref(skb, i); ++ ++ /* skb frags point to kernel buffers */ ++ for (i = 0; i < new_frags - 1; i++) { ++ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); ++ head = (struct page *)page_private(head); ++ } ++ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); ++ skb_shinfo(skb)->nr_frags = new_frags; ++ ++release: ++ skb_zcopy_clear(skb, false); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_copy_ubufs); ++ ++/** ++ * skb_clone - duplicate an sk_buff ++ * @skb: buffer to clone ++ * @gfp_mask: allocation priority ++ * ++ * Duplicate an &sk_buff. The new one is not owned by a socket. Both ++ * copies share the same packet data but not structure. The new ++ * buffer has a reference count of 1. If the allocation fails the ++ * function returns %NULL otherwise the new buffer is returned. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. 
++ */ ++ ++struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ struct sk_buff_fclones *fclones = container_of(skb, ++ struct sk_buff_fclones, ++ skb1); ++ struct sk_buff *n; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ return NULL; ++ ++ if (skb->fclone == SKB_FCLONE_ORIG && ++ refcount_read(&fclones->fclone_ref) == 1) { ++ n = &fclones->skb2; ++ refcount_set(&fclones->fclone_ref, 2); ++ n->fclone = SKB_FCLONE_CLONE; ++ } else { ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); ++ if (!n) ++ return NULL; ++ ++ n->fclone = SKB_FCLONE_UNAVAILABLE; ++ } ++ ++ return __skb_clone(n, skb); ++} ++EXPORT_SYMBOL(skb_clone); ++ ++void skb_headers_offset_update(struct sk_buff *skb, int off) ++{ ++ /* Only adjust this if it actually is csum_start rather than csum */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb->csum_start += off; ++ /* {transport,network,mac}_header and tail are relative to skb->head */ ++ skb->transport_header += off; ++ skb->network_header += off; ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += off; ++ skb->inner_transport_header += off; ++ skb->inner_network_header += off; ++ skb->inner_mac_header += off; ++} ++EXPORT_SYMBOL(skb_headers_offset_update); ++ ++void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ __copy_skb_header(new, old); ++ ++ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; ++ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; ++ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; ++} ++EXPORT_SYMBOL(skb_copy_header); ++ ++static inline int skb_alloc_rx_flag(const struct sk_buff *skb) ++{ ++ if (skb_pfmemalloc(skb)) ++ return SKB_ALLOC_RX; ++ return 0; ++} ++ ++/** ++ * skb_copy - create private copy of an sk_buff ++ * @skb: buffer to copy ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data. This is used when the ++ * caller wishes to modify the data and needs a private copy of the ++ * data to alter. Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * As by-product this function converts non-linear &sk_buff to linear ++ * one, so that &sk_buff becomes completely private and caller is allowed ++ * to modify all the data of returned buffer. This means that this ++ * function is not recommended for use in circumstances when only ++ * header is going to be modified. Use pskb_copy() instead. ++ */ ++ ++struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int headerlen = skb_headroom(skb); ++ unsigned int size = skb_end_offset(skb) + skb->data_len; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ return NULL; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headerlen); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); ++ ++ skb_copy_header(n, skb); ++ return n; ++} ++EXPORT_SYMBOL(skb_copy); ++ ++/** ++ * __pskb_copy_fclone - create copy of an sk_buff with private head. ++ * @skb: buffer to copy ++ * @headroom: headroom of new skb ++ * @gfp_mask: allocation priority ++ * @fclone: if true allocate the copy of the skb from the fclone ++ * cache instead of the head cache; it is recommended to set this ++ * to true for the cases where the copy will likely be cloned ++ * ++ * Make a copy of both an &sk_buff and part of its data, located ++ * in header. 
Fragmented data remain shared. This is used when ++ * the caller wishes to modify only header of &sk_buff and needs ++ * private copy of the header to alter. Returns %NULL on failure ++ * or the pointer to the buffer on success. ++ * The returned buffer has a reference count of 1. ++ */ ++ ++struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, ++ gfp_t gfp_mask, bool fclone) ++{ ++ unsigned int size = skb_headlen(skb) + headroom; ++ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); ++ ++ if (!n) ++ goto out; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headroom); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb_headlen(skb)); ++ /* Copy the bytes */ ++ skb_copy_from_linear_data(skb, n->data, n->len); ++ ++ n->truesize += skb->data_len; ++ n->data_len = skb->data_len; ++ n->len = skb->len; ++ ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ ++ if (skb_orphan_frags(skb, gfp_mask) || ++ skb_zerocopy_clone(n, skb, gfp_mask)) { ++ kfree_skb(n); ++ n = NULL; ++ goto out; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; ++ skb_frag_ref(skb, i); ++ } ++ skb_shinfo(n)->nr_frags = i; ++ } ++ ++ if (skb_has_frag_list(skb)) { ++ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; ++ skb_clone_fraglist(n); ++ } ++ ++ skb_copy_header(n, skb); ++out: ++ return n; ++} ++EXPORT_SYMBOL(__pskb_copy_fclone); ++ ++/** ++ * pskb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @nhead: room to add at head ++ * @ntail: room to add at tail ++ * @gfp_mask: allocation priority ++ * ++ * Expands (or creates identical copy, if @nhead and @ntail are zero) ++ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have ++ * reference count of 1. Returns zero in the case of success or error, ++ * if expansion failed. In the last case, &sk_buff is not changed. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ++ gfp_t gfp_mask) ++{ ++ int i, osize = skb_end_offset(skb); ++ int size = osize + nhead + ntail; ++ long off; ++ u8 *data; ++ ++ BUG_ON(nhead < 0); ++ ++ BUG_ON(skb_shared(skb)); ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy only real data... and, alas, header. This should be ++ * optimized for the cases when header is void. 
++ */ ++ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); ++ ++ /* ++ * if shinfo is shared we must drop the old head gracefully, but if it ++ * is not we can just drop the old head and let the existing refcount ++ * be since all we did is relocate the values ++ */ ++ if (skb_cloned(skb)) { ++ if (skb_orphan_frags(skb, gfp_mask)) ++ goto nofrags; ++ if (skb_zcopy(skb)) ++ refcount_inc(&skb_uarg(skb)->refcnt); ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ skb_release_data(skb); ++ } else { ++ skb_free_head(skb); ++ } ++ off = (data + nhead) - skb->head; ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data += off; ++ ++ skb_set_end_offset(skb, size); ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ off = nhead; ++#endif ++ skb->tail += off; ++ skb_headers_offset_update(skb, nhead); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ skb_metadata_clear(skb); ++ ++ /* It is not generally safe to change skb->truesize. ++ * For the moment, we really care of rx path, or ++ * when skb is orphaned (not attached to a socket). ++ */ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb->truesize += size - osize; ++ ++ return 0; ++ ++nofrags: ++ kfree(data); ++nodata: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(pskb_expand_head); ++ ++/* Make private copy of skb with writable head and some headroom */ ++ ++struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) ++{ ++ struct sk_buff *skb2; ++ int delta = headroom - skb_headroom(skb); ++ ++ if (delta <= 0) ++ skb2 = pskb_copy(skb, GFP_ATOMIC); ++ else { ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ++ GFP_ATOMIC)) { ++ kfree_skb(skb2); ++ skb2 = NULL; ++ } ++ } ++ return skb2; ++} ++EXPORT_SYMBOL(skb_realloc_headroom); ++ ++int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) ++{ ++ unsigned int saved_end_offset, saved_truesize; ++ struct skb_shared_info *shinfo; ++ int res; ++ ++ saved_end_offset = skb_end_offset(skb); ++ saved_truesize = skb->truesize; ++ ++ res = pskb_expand_head(skb, 0, 0, pri); ++ if (res) ++ return res; ++ ++ skb->truesize = saved_truesize; ++ ++ if (likely(skb_end_offset(skb) == saved_end_offset)) ++ return 0; ++ ++ shinfo = skb_shinfo(skb); ++ ++ /* We are about to change back skb->end, ++ * we need to move skb_shinfo() to its new location. ++ */ ++ memmove(skb->head + saved_end_offset, ++ shinfo, ++ offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); ++ ++ skb_set_end_offset(skb, saved_end_offset); ++ ++ return 0; ++} ++ ++/** ++ * skb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @headroom: needed headroom ++ * ++ * Unlike skb_realloc_headroom, this one does not allocate a new skb ++ * if possible; copies skb->sk to new skb as needed ++ * and frees original skb in case of failures. ++ * ++ * It expect increased headroom and generates warning otherwise. 
++ */ ++ ++struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) ++{ ++ int delta = headroom - skb_headroom(skb); ++ int osize = skb_end_offset(skb); ++ struct sock *sk = skb->sk; ++ ++ if (WARN_ONCE(delta <= 0, ++ "%s is expecting an increase in the headroom", __func__)) ++ return skb; ++ ++ delta = SKB_DATA_ALIGN(delta); ++ /* pskb_expand_head() might crash, if skb is shared. */ ++ if (skb_shared(skb) || !is_skb_wmem(skb)) { ++ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); ++ ++ if (unlikely(!nskb)) ++ goto fail; ++ ++ if (sk) ++ skb_set_owner_w(nskb, sk); ++ consume_skb(skb); ++ skb = nskb; ++ } ++ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) ++ goto fail; ++ ++ if (sk && is_skb_wmem(skb)) { ++ delta = skb_end_offset(skb) - osize; ++ refcount_add(delta, &sk->sk_wmem_alloc); ++ skb->truesize += delta; ++ } ++ return skb; ++ ++fail: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_expand_head); ++ ++/** ++ * skb_copy_expand - copy and expand sk_buff ++ * @skb: buffer to copy ++ * @newheadroom: new free bytes at head ++ * @newtailroom: new free bytes at tail ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data and while doing so ++ * allocate additional space. ++ * ++ * This is used when the caller wishes to modify the data and needs a ++ * private copy of the data to alter as well as more space for new fields. ++ * Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * You must pass %GFP_ATOMIC as the allocation priority if this function ++ * is called from an interrupt. ++ */ ++struct sk_buff *skb_copy_expand(const struct sk_buff *skb, ++ int newheadroom, int newtailroom, ++ gfp_t gfp_mask) ++{ ++ /* ++ * Allocate the copy buffer ++ */ ++ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, ++ gfp_mask, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ int oldheadroom = skb_headroom(skb); ++ int head_copy_len, head_copy_off; ++ ++ if (!n) ++ return NULL; ++ ++ skb_reserve(n, newheadroom); ++ ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ head_copy_len = oldheadroom; ++ head_copy_off = 0; ++ if (newheadroom <= head_copy_len) ++ head_copy_len = newheadroom; ++ else ++ head_copy_off = newheadroom - head_copy_len; ++ ++ /* Copy the linear header and data. */ ++ BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, ++ skb->len + head_copy_len)); ++ ++ skb_copy_header(n, skb); ++ ++ skb_headers_offset_update(n, newheadroom - oldheadroom); ++ ++ return n; ++} ++EXPORT_SYMBOL(skb_copy_expand); ++ ++/** ++ * __skb_pad - zero pad the tail of an skb ++ * @skb: buffer to pad ++ * @pad: space to pad ++ * @free_on_error: free buffer on error ++ * ++ * Ensure that a buffer is followed by a padding area that is zero ++ * filled. Used by network drivers which may DMA or transfer data ++ * beyond the buffer end onto the wire. ++ * ++ * May return error in out of memory cases. The skb is freed on error ++ * if @free_on_error is true. ++ */ ++ ++int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) ++{ ++ int err; ++ int ntail; ++ ++ /* If the skbuff is non linear tailroom is always zero.. 
*/ ++ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { ++ memset(skb->data+skb->len, 0, pad); ++ return 0; ++ } ++ ++ ntail = skb->data_len + pad - (skb->end - skb->tail); ++ if (likely(skb_cloned(skb) || ntail > 0)) { ++ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto free_skb; ++ } ++ ++ /* FIXME: The use of this function with non-linear skb's really needs ++ * to be audited. ++ */ ++ err = skb_linearize(skb); ++ if (unlikely(err)) ++ goto free_skb; ++ ++ memset(skb->data + skb->len, 0, pad); ++ return 0; ++ ++free_skb: ++ if (free_on_error) ++ kfree_skb(skb); ++ return err; ++} ++EXPORT_SYMBOL(__skb_pad); ++ ++/** ++ * pskb_put - add data to the tail of a potentially fragmented buffer ++ * @skb: start of the buffer to use ++ * @tail: tail fragment of the buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the potentially ++ * fragmented buffer. @tail must be the last fragment of @skb -- or ++ * @skb itself. If this would exceed the total buffer size the kernel ++ * will panic. A pointer to the first byte of the extra data is ++ * returned. ++ */ ++ ++void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) ++{ ++ if (tail != skb) { ++ skb->data_len += len; ++ skb->len += len; ++ } ++ return skb_put(tail, len); ++} ++EXPORT_SYMBOL_GPL(pskb_put); ++ ++/** ++ * skb_put - add data to a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer. If this would ++ * exceed the total buffer size the kernel will panic. A pointer to the ++ * first byte of the extra data is returned. ++ */ ++void *skb_put(struct sk_buff *skb, unsigned int len) ++{ ++ void *tmp = skb_tail_pointer(skb); ++ SKB_LINEAR_ASSERT(skb); ++ skb->tail += len; ++ skb->len += len; ++ if (unlikely(skb->tail > skb->end)) ++ skb_over_panic(skb, len, __builtin_return_address(0)); ++ return tmp; ++} ++EXPORT_SYMBOL(skb_put); ++ ++/** ++ * skb_push - add data to the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to add ++ * ++ * This function extends the used data area of the buffer at the buffer ++ * start. If this would exceed the total buffer headroom the kernel will ++ * panic. A pointer to the first byte of the extra data is returned. ++ */ ++void *skb_push(struct sk_buff *skb, unsigned int len) ++{ ++ skb->data -= len; ++ skb->len += len; ++ if (unlikely(skb->data < skb->head)) ++ skb_under_panic(skb, len, __builtin_return_address(0)); ++ return skb->data; ++} ++EXPORT_SYMBOL(skb_push); ++ ++/** ++ * skb_pull - remove data from the start of a buffer ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the next data in the buffer ++ * is returned. Once the data has been pulled future pushes will overwrite ++ * the old data. ++ */ ++void *skb_pull(struct sk_buff *skb, unsigned int len) ++{ ++ return skb_pull_inline(skb, len); ++} ++EXPORT_SYMBOL(skb_pull); ++ ++/** ++ * skb_pull_data - remove data from the start of a buffer returning its ++ * original position. ++ * @skb: buffer to use ++ * @len: amount of data to remove ++ * ++ * This function removes data from the start of a buffer, returning ++ * the memory to the headroom. A pointer to the original data in the buffer ++ * is returned after checking if there is enough data to pull. Once the ++ * data has been pulled future pushes will overwrite the old data. 
++ */ ++void *skb_pull_data(struct sk_buff *skb, size_t len) ++{ ++ void *data = skb->data; ++ ++ if (skb->len < len) ++ return NULL; ++ ++ skb_pull(skb, len); ++ ++ return data; ++} ++EXPORT_SYMBOL(skb_pull_data); ++ ++/** ++ * skb_trim - remove end from a buffer ++ * @skb: buffer to alter ++ * @len: new length ++ * ++ * Cut the length of a buffer down by removing data from the tail. If ++ * the buffer is already under the length specified it is not modified. ++ * The skb must be linear. ++ */ ++void skb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->len > len) ++ __skb_trim(skb, len); ++} ++EXPORT_SYMBOL(skb_trim); ++ ++/* Trims skb to length len. It can change skb pointers. ++ */ ++ ++int ___pskb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ struct sk_buff **fragp; ++ struct sk_buff *frag; ++ int offset = skb_headlen(skb); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int err; ++ ++ if (skb_cloned(skb) && ++ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) ++ return err; ++ ++ i = 0; ++ if (offset >= len) ++ goto drop_pages; ++ ++ for (; i < nfrags; i++) { ++ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); ++ ++drop_pages: ++ skb_shinfo(skb)->nr_frags = i; ++ ++ for (; i < nfrags; i++) ++ skb_frag_unref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ goto done; ++ } ++ ++ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); ++ fragp = &frag->next) { ++ int end = offset + frag->len; ++ ++ if (skb_shared(frag)) { ++ struct sk_buff *nfrag; ++ ++ nfrag = skb_clone(frag, GFP_ATOMIC); ++ if (unlikely(!nfrag)) ++ return -ENOMEM; ++ ++ nfrag->next = frag->next; ++ consume_skb(frag); ++ frag = nfrag; ++ *fragp = frag; ++ } ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ if (end > len && ++ unlikely((err = pskb_trim(frag, len - offset)))) ++ return err; ++ ++ if (frag->next) ++ skb_drop_list(&frag->next); ++ break; ++ } ++ ++done: ++ if (len > skb_headlen(skb)) { ++ skb->data_len -= skb->len - len; ++ skb->len = len; ++ } else { ++ skb->len = len; ++ skb->data_len = 0; ++ skb_set_tail_pointer(skb, len); ++ } ++ ++ if (!skb->sk || skb->destructor == sock_edemux) ++ skb_condense(skb); ++ return 0; ++} ++EXPORT_SYMBOL(___pskb_trim); ++ ++/* Note : use pskb_trim_rcsum() instead of calling this directly ++ */ ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ int delta = skb->len - len; ++ ++ skb->csum = csum_block_sub(skb->csum, ++ skb_checksum(skb, len, delta, 0), ++ len); ++ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; ++ int offset = skb_checksum_start_offset(skb) + skb->csum_offset; ++ ++ if (offset + sizeof(__sum16) > hdlen) ++ return -EINVAL; ++ } ++ return __pskb_trim(skb, len); ++} ++EXPORT_SYMBOL(pskb_trim_rcsum_slow); ++ ++/** ++ * __pskb_pull_tail - advance tail of skb header ++ * @skb: buffer to reallocate ++ * @delta: number of bytes to advance tail ++ * ++ * The function makes a sense only on a fragmented &sk_buff, ++ * it expands header moving its tail forward and copying necessary ++ * data from fragmented part. ++ * ++ * &sk_buff MUST have reference count of 1. ++ * ++ * Returns %NULL (and &sk_buff does not change) if pull failed ++ * or value of new tail of skb in the case of success. 
++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++/* Moves tail of skb head forward, copying data from fragmented part, ++ * when it is necessary. ++ * 1. It may fail due to malloc failure. ++ * 2. It may change skb pointers. ++ * ++ * It is pretty complicated. Luckily, it is called only in exceptional cases. ++ */ ++void *__pskb_pull_tail(struct sk_buff *skb, int delta) ++{ ++ /* If skb has not enough free space at tail, get new one ++ * plus 128 bytes for future expansions. If we have enough ++ * room at tail, reallocate without expansion only if skb is cloned. ++ */ ++ int i, k, eat = (skb->tail + delta) - skb->end; ++ ++ if (eat > 0 || skb_cloned(skb)) { ++ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, ++ GFP_ATOMIC)) ++ return NULL; ++ } ++ ++ BUG_ON(skb_copy_bits(skb, skb_headlen(skb), ++ skb_tail_pointer(skb), delta)); ++ ++ /* Optimization: no fragments, no reasons to preestimate ++ * size of pulled pages. Superb. ++ */ ++ if (!skb_has_frag_list(skb)) ++ goto pull_pages; ++ ++ /* Estimate size of pulled pages. */ ++ eat = delta; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size >= eat) ++ goto pull_pages; ++ eat -= size; ++ } ++ ++ /* If we need update frag list, we are in troubles. ++ * Certainly, it is possible to add an offset to skb data, ++ * but taking into account that pulling is expected to ++ * be very rare operation, it is worth to fight against ++ * further bloating skb head and crucify ourselves here instead. ++ * Pure masohism, indeed. 8)8) ++ */ ++ if (eat) { ++ struct sk_buff *list = skb_shinfo(skb)->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ ++ if (skb_shared(list)) { ++ /* Sucks! We need to fork list. :-( */ ++ clone = skb_clone(list, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without ++ * problems. */ ++ insp = list; ++ } ++ if (!pskb_pull(list, eat)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. */ ++ while ((list = skb_shinfo(skb)->frag_list) != insp) { ++ skb_shinfo(skb)->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ skb_shinfo(skb)->frag_list = clone; ++ } ++ } ++ /* Success! Now we may commit changes to skb data. 
*/ ++ ++pull_pages: ++ eat = delta; ++ k = 0; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size <= eat) { ++ skb_frag_unref(skb, i); ++ eat -= size; ++ } else { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; ++ ++ *frag = skb_shinfo(skb)->frags[i]; ++ if (eat) { ++ skb_frag_off_add(frag, eat); ++ skb_frag_size_sub(frag, eat); ++ if (!i) ++ goto end; ++ eat = 0; ++ } ++ k++; ++ } ++ } ++ skb_shinfo(skb)->nr_frags = k; ++ ++end: ++ skb->tail += delta; ++ skb->data_len -= delta; ++ ++ if (!skb->data_len) ++ skb_zcopy_clear(skb, false); ++ ++ return skb_tail_pointer(skb); ++} ++EXPORT_SYMBOL(__pskb_pull_tail); ++ ++/** ++ * skb_copy_bits - copy bits from skb to kernel buffer ++ * @skb: source skb ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source skb to the ++ * destination buffer. ++ * ++ * CAUTION ! : ++ * If its prototype is ever changed, ++ * check arch/{*}/net/{*}.S files, ++ * since it is called from BPF assembly code. ++ */ ++int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ /* Copy header. */ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_from_linear_data_offset(skb, offset, to, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(f); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(f, ++ skb_frag_off(f) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(to + copied, vaddr + p_off, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_copy_bits(frag_iter, offset - start, to, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_copy_bits); ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. 
++ */ ++static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ put_page(spd->pages[i]); ++} ++ ++static struct page *linear_to_page(struct page *page, unsigned int *len, ++ unsigned int *offset, ++ struct sock *sk) ++{ ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ return NULL; ++ ++ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); ++ ++ memcpy(page_address(pfrag->page) + pfrag->offset, ++ page_address(page) + *offset, *len); ++ *offset = pfrag->offset; ++ pfrag->offset += *len; ++ ++ return pfrag->page; ++} ++ ++static bool spd_can_coalesce(const struct splice_pipe_desc *spd, ++ struct page *page, ++ unsigned int offset) ++{ ++ return spd->nr_pages && ++ spd->pages[spd->nr_pages - 1] == page && ++ (spd->partial[spd->nr_pages - 1].offset + ++ spd->partial[spd->nr_pages - 1].len == offset); ++} ++ ++/* ++ * Fill page/offset/length into spd, if it can hold more pages. ++ */ ++static bool spd_fill_page(struct splice_pipe_desc *spd, ++ struct pipe_inode_info *pipe, struct page *page, ++ unsigned int *len, unsigned int offset, ++ bool linear, ++ struct sock *sk) ++{ ++ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) ++ return true; ++ ++ if (linear) { ++ page = linear_to_page(page, len, &offset, sk); ++ if (!page) ++ return true; ++ } ++ if (spd_can_coalesce(spd, page, offset)) { ++ spd->partial[spd->nr_pages - 1].len += *len; ++ return false; ++ } ++ get_page(page); ++ spd->pages[spd->nr_pages] = page; ++ spd->partial[spd->nr_pages].len = *len; ++ spd->partial[spd->nr_pages].offset = offset; ++ spd->nr_pages++; ++ ++ return false; ++} ++ ++static bool __splice_segment(struct page *page, unsigned int poff, ++ unsigned int plen, unsigned int *off, ++ unsigned int *len, ++ struct splice_pipe_desc *spd, bool linear, ++ struct sock *sk, ++ struct pipe_inode_info *pipe) ++{ ++ if (!*len) ++ return true; ++ ++ /* skip this segment if already processed */ ++ if (*off >= plen) { ++ *off -= plen; ++ return false; ++ } ++ ++ /* ignore any bits we already processed */ ++ poff += *off; ++ plen -= *off; ++ *off = 0; ++ ++ do { ++ unsigned int flen = min(*len, plen); ++ ++ if (spd_fill_page(spd, pipe, page, &flen, poff, ++ linear, sk)) ++ return true; ++ poff += flen; ++ plen -= flen; ++ *len -= flen; ++ } while (*len && plen); ++ ++ return false; ++} ++ ++/* ++ * Map linear and fragment data from the skb to spd. It reports true if the ++ * pipe is full or if we already spliced the requested length. ++ */ ++static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, ++ unsigned int *offset, unsigned int *len, ++ struct splice_pipe_desc *spd, struct sock *sk) ++{ ++ int seg; ++ struct sk_buff *iter; ++ ++ /* map the linear part : ++ * If skb->head_frag is set, this 'linear' part is backed by a ++ * fragment, and if the head is not shared with any clones then ++ * we can avoid a copy since we own the head portion of this page. 
++ */ ++ if (__splice_segment(virt_to_page(skb->data), ++ (unsigned long) skb->data & (PAGE_SIZE - 1), ++ skb_headlen(skb), ++ offset, len, spd, ++ skb_head_is_locked(skb), ++ sk, pipe)) ++ return true; ++ ++ /* ++ * then map the fragments ++ */ ++ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { ++ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; ++ ++ if (__splice_segment(skb_frag_page(f), ++ skb_frag_off(f), skb_frag_size(f), ++ offset, len, spd, false, sk, pipe)) ++ return true; ++ } ++ ++ skb_walk_frags(skb, iter) { ++ if (*offset >= iter->len) { ++ *offset -= iter->len; ++ continue; ++ } ++ /* __skb_splice_bits() only fails if the output has no room ++ * left, so no point in going over the frag_list for the error ++ * case. ++ */ ++ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Map data from the skb to a pipe. Should handle both the linear part, ++ * the fragments, and the frag list. ++ */ ++int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, ++ struct pipe_inode_info *pipe, unsigned int tlen, ++ unsigned int flags) ++{ ++ struct partial_page partial[MAX_SKB_FRAGS]; ++ struct page *pages[MAX_SKB_FRAGS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, ++ .ops = &nosteal_pipe_buf_ops, ++ .spd_release = sock_spd_release, ++ }; ++ int ret = 0; ++ ++ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); ++ ++ if (spd.nr_pages) ++ ret = splice_to_pipe(pipe, &spd); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(skb_splice_bits); ++ ++static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendmsg(sock, msg, vec, num, size); ++} ++ ++static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct socket *sock = sk->sk_socket; ++ ++ if (!sock) ++ return -EINVAL; ++ return kernel_sendpage(sock, page, offset, size, flags); ++} ++ ++typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size); ++typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, ++ size_t size, int flags); ++static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, ++ int len, sendmsg_func sendmsg, sendpage_func sendpage) ++{ ++ unsigned int orig_len = len; ++ struct sk_buff *head = skb; ++ unsigned short fragidx; ++ int slen, ret; ++ ++do_frag_list: ++ ++ /* Deal with head data */ ++ while (offset < skb_headlen(skb) && len) { ++ struct kvec kv; ++ struct msghdr msg; ++ ++ slen = min_t(int, len, skb_headlen(skb) - offset); ++ kv.iov_base = skb->data + offset; ++ kv.iov_len = slen; ++ memset(&msg, 0, sizeof(msg)); ++ msg.msg_flags = MSG_DONTWAIT; ++ ++ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, ++ sendmsg_unlocked, sk, &msg, &kv, 1, slen); ++ if (ret <= 0) ++ goto error; ++ ++ offset += ret; ++ len -= ret; ++ } ++ ++ /* All the data was skb head? 
*/ ++ if (!len) ++ goto out; ++ ++ /* Make offset relative to start of frags */ ++ offset -= skb_headlen(skb); ++ ++ /* Find where we are in frag list */ ++ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ if (offset < skb_frag_size(frag)) ++ break; ++ ++ offset -= skb_frag_size(frag); ++ } ++ ++ for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; ++ ++ slen = min_t(size_t, len, skb_frag_size(frag) - offset); ++ ++ while (slen) { ++ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, ++ sendpage_unlocked, sk, ++ skb_frag_page(frag), ++ skb_frag_off(frag) + offset, ++ slen, MSG_DONTWAIT); ++ if (ret <= 0) ++ goto error; ++ ++ len -= ret; ++ offset += ret; ++ slen -= ret; ++ } ++ ++ offset = 0; ++ } ++ ++ if (len) { ++ /* Process any frag lists */ ++ ++ if (skb == head) { ++ if (skb_has_frag_list(skb)) { ++ skb = skb_shinfo(skb)->frag_list; ++ goto do_frag_list; ++ } ++ } else if (skb->next) { ++ skb = skb->next; ++ goto do_frag_list; ++ } ++ } ++ ++out: ++ return orig_len - len; ++ ++error: ++ return orig_len == len ? ret : orig_len - len; ++} ++ ++/* Send skb data on a socket. Socket must be locked. */ ++int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, ++ int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, ++ kernel_sendpage_locked); ++} ++EXPORT_SYMBOL_GPL(skb_send_sock_locked); ++ ++/* Send skb data on a socket. Socket must be unlocked. */ ++int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) ++{ ++ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, ++ sendpage_unlocked); ++} ++ ++/** ++ * skb_store_bits - store bits from kernel buffer to skb ++ * @skb: destination buffer ++ * @offset: offset in destination ++ * @from: source buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source buffer to the ++ * destination skb. This function handles all the messy bits of ++ * traversing fragment lists and such. 
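++ *
++ * A short sketch of a typical call (illustrative only; "offset" is a
++ * hypothetical, caller-computed position inside the packet): rewriting
++ * a two-byte field of a possibly non-linear skb might look like
++ *
++ *	__be16 val = htons(ETH_P_IP);
++ *	int err = skb_store_bits(skb, offset, &val, sizeof(val));
++ *
++ *	if (err)
++ *		return err;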
++ */ ++ ++int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_to_linear_data_offset(skb, offset, from, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ memcpy(vaddr + p_off, from + copied, p_len); ++ kunmap_atomic(vaddr); ++ } ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_store_bits(frag_iter, offset - start, ++ from, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_store_bits); ++ ++/* Checksum skb data. */ ++__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, ++ __wsum csum, const struct skb_checksum_ops *ops) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Checksum header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, ++ skb->data + offset, copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = INDIRECT_CALL_1(ops->update, ++ csum_partial_ext, ++ vaddr + p_off, p_len, 0); ++ kunmap_atomic(vaddr); ++ csum = INDIRECT_CALL_1(ops->combine, ++ csum_block_add_ext, csum, ++ csum2, pos, p_len); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ if (copy > len) ++ copy = len; ++ csum2 = __skb_checksum(frag_iter, offset - start, ++ copy, 0, ops); ++ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, ++ csum, csum2, pos, copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ ++ return csum; ++} ++EXPORT_SYMBOL(__skb_checksum); ++ ++__wsum skb_checksum(const struct sk_buff *skb, int offset, ++ int len, __wsum csum) ++{ ++ const struct skb_checksum_ops ops = { ++ .update = csum_partial_ext, ++ .combine = csum_block_add_ext, ++ }; ++ ++ return __skb_checksum(skb, offset, len, csum, &ops); ++} ++EXPORT_SYMBOL(skb_checksum); ++ ++/* Both of above in one bottle. */ ++ ++__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, ++ u8 *to, int len) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ __wsum csum = 0; ++ ++ /* Copy header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial_copy_nocheck(skb->data + offset, to, ++ copy); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ u32 p_off, p_len, copied; ++ struct page *p; ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ skb_frag_foreach_page(frag, ++ skb_frag_off(frag) + offset - start, ++ copy, p, p_off, p_len, copied) { ++ vaddr = kmap_atomic(p); ++ csum2 = csum_partial_copy_nocheck(vaddr + p_off, ++ to + copied, ++ p_len); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ pos += p_len; ++ } ++ ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ __wsum csum2; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ csum2 = skb_copy_and_csum_bits(frag_iter, ++ offset - start, ++ to, copy); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return csum; ++} ++EXPORT_SYMBOL(skb_copy_and_csum_bits); ++ ++__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) ++{ ++ __sum16 sum; ++ ++ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); ++ /* See comments in __skb_checksum_complete(). */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ if (!skb_shared(skb)) ++ skb->csum_valid = !sum; ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete_head); ++ ++/* This function assumes skb->csum already holds pseudo header's checksum, ++ * which has been changed from the hardware checksum, for example, by ++ * __skb_checksum_validate_complete(). And, the original skb->csum must ++ * have been validated unsuccessfully for CHECKSUM_COMPLETE case. ++ * ++ * It returns non-zero if the recomputed checksum is still invalid, otherwise ++ * zero. The new checksum is stored back into skb->csum unless the skb is ++ * shared. ++ */ ++__sum16 __skb_checksum_complete(struct sk_buff *skb) ++{ ++ __wsum csum; ++ __sum16 sum; ++ ++ csum = skb_checksum(skb, 0, skb->len, 0); ++ ++ sum = csum_fold(csum_add(skb->csum, csum)); ++ /* This check is inverted, because we already knew the hardware ++ * checksum is invalid before calling this function. So, if the ++ * re-computed checksum is valid instead, then we have a mismatch ++ * between the original skb->csum and skb_checksum(). This means either ++ * the original hardware checksum is incorrect or we screw up skb->csum ++ * when moving skb->data around. 
++ */ ++ if (likely(!sum)) { ++ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && ++ !skb->csum_complete_sw) ++ netdev_rx_csum_fault(skb->dev, skb); ++ } ++ ++ if (!skb_shared(skb)) { ++ /* Save full packet checksum */ ++ skb->csum = csum; ++ skb->ip_summed = CHECKSUM_COMPLETE; ++ skb->csum_complete_sw = 1; ++ skb->csum_valid = !sum; ++ } ++ ++ return sum; ++} ++EXPORT_SYMBOL(__skb_checksum_complete); ++ ++static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, ++ int offset, int len) ++{ ++ net_warn_ratelimited( ++ "%s: attempt to compute crc32c without libcrc32c.ko\n", ++ __func__); ++ return 0; ++} ++ ++static const struct skb_checksum_ops default_crc32c_ops = { ++ .update = warn_crc32c_csum_update, ++ .combine = warn_crc32c_csum_combine, ++}; ++ ++const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = ++ &default_crc32c_ops; ++EXPORT_SYMBOL(crc32c_csum_stub); ++ ++ /** ++ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() ++ * @from: source buffer ++ * ++ * Calculates the amount of linear headroom needed in the 'to' skb passed ++ * into skb_zerocopy(). ++ */ ++unsigned int ++skb_zerocopy_headlen(const struct sk_buff *from) ++{ ++ unsigned int hlen = 0; ++ ++ if (!from->head_frag || ++ skb_headlen(from) < L1_CACHE_BYTES || ++ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { ++ hlen = skb_headlen(from); ++ if (!hlen) ++ hlen = from->len; ++ } ++ ++ if (skb_has_frag_list(from)) ++ hlen = from->len; ++ ++ return hlen; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); ++ ++/** ++ * skb_zerocopy - Zero copy skb to skb ++ * @to: destination buffer ++ * @from: source buffer ++ * @len: number of bytes to copy from source buffer ++ * @hlen: size of linear headroom in destination buffer ++ * ++ * Copies up to `len` bytes from `from` to `to` by creating references ++ * to the frags in the source buffer. ++ * ++ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the ++ * headroom in the `to` buffer. 
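++ *
++ * As a sketch of the intended pairing (not the only valid usage, and
++ * with error handling left to the caller), the destination is typically
++ * sized from skb_zerocopy_headlen():
++ *
++ *	unsigned int hlen = skb_zerocopy_headlen(from);
++ *	struct sk_buff *to = alloc_skb(hlen, GFP_ATOMIC);
++ *
++ *	if (to)
++ *		err = skb_zerocopy(to, from, from->len, hlen);
++ *
++ * with the possible values of "err" listed below.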
++ * ++ * Return value: ++ * 0: everything is OK ++ * -ENOMEM: couldn't orphan frags of @from due to lack of memory ++ * -EFAULT: skb_copy_bits() found some problem with skb geometry ++ */ ++int ++skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) ++{ ++ int i, j = 0; ++ int plen = 0; /* length of skb->head fragment */ ++ int ret; ++ struct page *page; ++ unsigned int offset; ++ ++ BUG_ON(!from->head_frag && !hlen); ++ ++ /* dont bother with small payloads */ ++ if (len <= skb_tailroom(to)) ++ return skb_copy_bits(from, 0, skb_put(to, len), len); ++ ++ if (hlen) { ++ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); ++ if (unlikely(ret)) ++ return ret; ++ len -= hlen; ++ } else { ++ plen = min_t(int, skb_headlen(from), len); ++ if (plen) { ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ __skb_fill_page_desc(to, 0, page, offset, plen); ++ get_page(page); ++ j = 1; ++ len -= plen; ++ } ++ } ++ ++ skb_len_add(to, len + plen); ++ ++ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { ++ skb_tx_error(from); ++ return -ENOMEM; ++ } ++ skb_zerocopy_clone(to, from, GFP_ATOMIC); ++ ++ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { ++ int size; ++ ++ if (!len) ++ break; ++ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; ++ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), ++ len); ++ skb_frag_size_set(&skb_shinfo(to)->frags[j], size); ++ len -= size; ++ skb_frag_ref(to, j); ++ j++; ++ } ++ skb_shinfo(to)->nr_frags = j; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_zerocopy); ++ ++void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) ++{ ++ __wsum csum; ++ long csstart; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ csstart = skb_checksum_start_offset(skb); ++ else ++ csstart = skb_headlen(skb); ++ ++ BUG_ON(csstart > skb_headlen(skb)); ++ ++ skb_copy_from_linear_data(skb, to, csstart); ++ ++ csum = 0; ++ if (csstart != skb->len) ++ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, ++ skb->len - csstart); ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ long csstuff = csstart + skb->csum_offset; ++ ++ *((__sum16 *)(to + csstuff)) = csum_fold(csum); ++ } ++} ++EXPORT_SYMBOL(skb_copy_and_csum_dev); ++ ++/** ++ * skb_dequeue - remove from the head of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the head of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The head item is ++ * returned or %NULL if the list is empty. ++ */ ++ ++struct sk_buff *skb_dequeue(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue); ++ ++/** ++ * skb_dequeue_tail - remove from the tail of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the tail of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The tail item is ++ * returned or %NULL if the list is empty. 
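++ *
++ * A small illustrative sketch (with "q" a caller-owned &sk_buff_head and
++ * "MY_QUEUE_LIMIT" a hypothetical cap): dropping the newest entries when
++ * a private queue grows too long could be written as
++ *
++ *	while (skb_queue_len(&q) > MY_QUEUE_LIMIT)
++ *		kfree_skb(skb_dequeue_tail(&q));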
++ */ ++struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue_tail(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue_tail); ++ ++/** ++ * skb_queue_purge - empty a list ++ * @list: list to empty ++ * ++ * Delete all buffers on an &sk_buff list. Each buffer is removed from ++ * the list and one reference dropped. This function takes the list ++ * lock and is atomic with respect to other list locking functions. ++ */ ++void skb_queue_purge(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(list)) != NULL) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL(skb_queue_purge); ++ ++/** ++ * skb_rbtree_purge - empty a skb rbtree ++ * @root: root of the rbtree to empty ++ * Return value: the sum of truesizes of all purged skbs. ++ * ++ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from ++ * the list and one reference dropped. This function does not take ++ * any lock. Synchronization should be handled by the caller (e.g., TCP ++ * out-of-order queue is protected by the socket lock). ++ */ ++unsigned int skb_rbtree_purge(struct rb_root *root) ++{ ++ struct rb_node *p = rb_first(root); ++ unsigned int sum = 0; ++ ++ while (p) { ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); ++ ++ p = rb_next(p); ++ rb_erase(&skb->rbnode, root); ++ sum += skb->truesize; ++ kfree_skb(skb); ++ } ++ return sum; ++} ++ ++/** ++ * skb_queue_head - queue a buffer at the list head ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the start of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_head(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_head); ++ ++/** ++ * skb_queue_tail - queue a buffer at the list tail ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the tail of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_tail(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_tail); ++ ++/** ++ * skb_unlink - remove a buffer from a list ++ * @skb: buffer to remove ++ * @list: list to use ++ * ++ * Remove a packet from a list. The list locks are taken and this ++ * function is atomic with respect to other list locked calls ++ * ++ * You must know what list the SKB is on. ++ */ ++void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_unlink(skb, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_unlink); ++ ++/** ++ * skb_append - append a buffer ++ * @old: buffer to insert after ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet after a given packet in a list. 
The list locks are taken ++ * and this function is atomic with respect to other list locked calls. ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_after(list, old, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_append); ++ ++static inline void skb_split_inside_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, const int pos) ++{ ++ int i; ++ ++ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), ++ pos - len); ++ /* And move data appendix as is. */ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; ++ ++ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->data_len = skb->data_len; ++ skb1->len += skb1->data_len; ++ skb->data_len = 0; ++ skb->len = len; ++ skb_set_tail_pointer(skb, len); ++} ++ ++static inline void skb_split_no_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, int pos) ++{ ++ int i, k = 0; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->len = skb1->data_len = skb->len - len; ++ skb->len = len; ++ skb->data_len = len - pos; ++ ++ for (i = 0; i < nfrags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + size > len) { ++ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < len) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_ref(skb, i); ++ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); ++ skb_shinfo(skb)->nr_frags++; ++ } ++ k++; ++ } else ++ skb_shinfo(skb)->nr_frags++; ++ pos += size; ++ } ++ skb_shinfo(skb1)->nr_frags = k; ++} ++ ++/** ++ * skb_split - Split fragmented skb to two parts at length len. ++ * @skb: the buffer to split ++ * @skb1: the buffer to receive the second part ++ * @len: new length for skb ++ */ ++void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) ++{ ++ int pos = skb_headlen(skb); ++ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; ++ ++ skb_zcopy_downgrade_managed(skb); ++ ++ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; ++ skb_zerocopy_clone(skb1, skb, 0); ++ if (len < pos) /* Split line is inside header. */ ++ skb_split_inside_header(skb, skb1, len, pos); ++ else /* Second chunk has no header, nothing to copy. */ ++ skb_split_no_header(skb, skb1, len, pos); ++} ++EXPORT_SYMBOL(skb_split); ++ ++/* Shifting from/to a cloned skb is a no-go. ++ * ++ * Caller cannot keep skb_shinfo related pointers past calling here! 
++ */ ++static int skb_prepare_for_shift(struct sk_buff *skb) ++{ ++ return skb_unclone_keeptruesize(skb, GFP_ATOMIC); ++} ++ ++/** ++ * skb_shift - Shifts paged data partially from skb to another ++ * @tgt: buffer into which tail data gets added ++ * @skb: buffer from which the paged data comes from ++ * @shiftlen: shift up to this many bytes ++ * ++ * Attempts to shift up to shiftlen worth of bytes, which may be less than ++ * the length of the skb, from skb to tgt. Returns number bytes shifted. ++ * It's up to caller to free skb if everything was shifted. ++ * ++ * If @tgt runs out of frags, the whole operation is aborted. ++ * ++ * Skb cannot include anything else but paged data while tgt is allowed ++ * to have non-paged data as well. ++ * ++ * TODO: full sized shift could be optimized but that would need ++ * specialized skb free'er to handle frags without up-to-date nr_frags. ++ */ ++int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) ++{ ++ int from, to, merge, todo; ++ skb_frag_t *fragfrom, *fragto; ++ ++ BUG_ON(shiftlen > skb->len); ++ ++ if (skb_headlen(skb)) ++ return 0; ++ if (skb_zcopy(tgt) || skb_zcopy(skb)) ++ return 0; ++ ++ todo = shiftlen; ++ from = 0; ++ to = skb_shinfo(tgt)->nr_frags; ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ ++ /* Actual merge is delayed until the point when we know we can ++ * commit all, so that we don't have to undo partial changes ++ */ ++ if (!to || ++ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), ++ skb_frag_off(fragfrom))) { ++ merge = -1; ++ } else { ++ merge = to - 1; ++ ++ todo -= skb_frag_size(fragfrom); ++ if (todo < 0) { ++ if (skb_prepare_for_shift(skb) || ++ skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ /* All previous frag pointers might be stale! */ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, shiftlen); ++ skb_frag_size_sub(fragfrom, shiftlen); ++ skb_frag_off_add(fragfrom, shiftlen); ++ ++ goto onlymerged; ++ } ++ ++ from++; ++ } ++ ++ /* Skip full, not-fitting skb to avoid expensive operations */ ++ if ((shiftlen == skb->len) && ++ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) ++ return 0; ++ ++ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { ++ if (to == MAX_SKB_FRAGS) ++ return 0; ++ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[to]; ++ ++ if (todo >= skb_frag_size(fragfrom)) { ++ *fragto = *fragfrom; ++ todo -= skb_frag_size(fragfrom); ++ from++; ++ to++; ++ ++ } else { ++ __skb_frag_ref(fragfrom); ++ skb_frag_page_copy(fragto, fragfrom); ++ skb_frag_off_copy(fragto, fragfrom); ++ skb_frag_size_set(fragto, todo); ++ ++ skb_frag_off_add(fragfrom, todo); ++ skb_frag_size_sub(fragfrom, todo); ++ todo = 0; ++ ++ to++; ++ break; ++ } ++ } ++ ++ /* Ready to "commit" this state change to tgt */ ++ skb_shinfo(tgt)->nr_frags = to; ++ ++ if (merge >= 0) { ++ fragfrom = &skb_shinfo(skb)->frags[0]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, skb_frag_size(fragfrom)); ++ __skb_frag_unref(fragfrom, skb->pp_recycle); ++ } ++ ++ /* Reposition in the original skb */ ++ to = 0; ++ while (from < skb_shinfo(skb)->nr_frags) ++ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; ++ skb_shinfo(skb)->nr_frags = to; ++ ++ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); ++ ++onlymerged: ++ /* Most likely the tgt won't ever need its checksum anymore, skb on ++ * the 
other hand might need it if it needs to be resent ++ */ ++ tgt->ip_summed = CHECKSUM_PARTIAL; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ skb_len_add(skb, -shiftlen); ++ skb_len_add(tgt, shiftlen); ++ ++ return shiftlen; ++} ++ ++/** ++ * skb_prepare_seq_read - Prepare a sequential read of skb data ++ * @skb: the buffer to read ++ * @from: lower offset of data to be read ++ * @to: upper offset of data to be read ++ * @st: state variable ++ * ++ * Initializes the specified state variable. Must be called before ++ * invoking skb_seq_read() for the first time. ++ */ ++void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct skb_seq_state *st) ++{ ++ st->lower_offset = from; ++ st->upper_offset = to; ++ st->root_skb = st->cur_skb = skb; ++ st->frag_idx = st->stepped_offset = 0; ++ st->frag_data = NULL; ++ st->frag_off = 0; ++} ++EXPORT_SYMBOL(skb_prepare_seq_read); ++ ++/** ++ * skb_seq_read - Sequentially read skb data ++ * @consumed: number of bytes consumed by the caller so far ++ * @data: destination pointer for data to be returned ++ * @st: state variable ++ * ++ * Reads a block of skb data at @consumed relative to the ++ * lower offset specified to skb_prepare_seq_read(). Assigns ++ * the head of the data block to @data and returns the length ++ * of the block or 0 if the end of the skb data or the upper ++ * offset has been reached. ++ * ++ * The caller is not required to consume all of the data ++ * returned, i.e. @consumed is typically set to the number ++ * of bytes already consumed and the next call to ++ * skb_seq_read() will return the remaining part of the block. ++ * ++ * Note 1: The size of each block of data returned can be arbitrary, ++ * this limitation is the cost for zerocopy sequential ++ * reads of potentially non linear data. ++ * ++ * Note 2: Fragment lists within fragments are not implemented ++ * at the moment, state->root_skb could be replaced with ++ * a stack for this purpose. 
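++ *
++ * A sketch of the usual calling sequence ("handle()" is a hypothetical
++ * consumer; it returns false here to stop early, which is the case
++ * where skb_abort_seq_read() is required):
++ *
++ *	struct skb_seq_state st;
++ *	unsigned int consumed = 0, len;
++ *	const u8 *data;
++ *
++ *	skb_prepare_seq_read(skb, 0, skb->len, &st);
++ *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
++ *		if (!handle(data, len)) {
++ *			skb_abort_seq_read(&st);
++ *			break;
++ *		}
++ *		consumed += len;
++ *	}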
++ */ ++unsigned int skb_seq_read(unsigned int consumed, const u8 **data, ++ struct skb_seq_state *st) ++{ ++ unsigned int block_limit, abs_offset = consumed + st->lower_offset; ++ skb_frag_t *frag; ++ ++ if (unlikely(abs_offset >= st->upper_offset)) { ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ return 0; ++ } ++ ++next_skb: ++ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; ++ ++ if (abs_offset < block_limit && !st->frag_data) { ++ *data = st->cur_skb->data + (abs_offset - st->stepped_offset); ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_idx == 0 && !st->frag_data) ++ st->stepped_offset += skb_headlen(st->cur_skb); ++ ++ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { ++ unsigned int pg_idx, pg_off, pg_sz; ++ ++ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; ++ ++ pg_idx = 0; ++ pg_off = skb_frag_off(frag); ++ pg_sz = skb_frag_size(frag); ++ ++ if (skb_frag_must_loop(skb_frag_page(frag))) { ++ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; ++ pg_off = offset_in_page(pg_off + st->frag_off); ++ pg_sz = min_t(unsigned int, pg_sz - st->frag_off, ++ PAGE_SIZE - pg_off); ++ } ++ ++ block_limit = pg_sz + st->stepped_offset; ++ if (abs_offset < block_limit) { ++ if (!st->frag_data) ++ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); ++ ++ *data = (u8 *)st->frag_data + pg_off + ++ (abs_offset - st->stepped_offset); ++ ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ st->stepped_offset += pg_sz; ++ st->frag_off += pg_sz; ++ if (st->frag_off == skb_frag_size(frag)) { ++ st->frag_off = 0; ++ st->frag_idx++; ++ } ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { ++ st->cur_skb = skb_shinfo(st->root_skb)->frag_list; ++ st->frag_idx = 0; ++ goto next_skb; ++ } else if (st->cur_skb->next) { ++ st->cur_skb = st->cur_skb->next; ++ st->frag_idx = 0; ++ goto next_skb; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_seq_read); ++ ++/** ++ * skb_abort_seq_read - Abort a sequential read of skb data ++ * @st: state variable ++ * ++ * Must be called if skb_seq_read() was not called until it ++ * returned 0. ++ */ ++void skb_abort_seq_read(struct skb_seq_state *st) ++{ ++ if (st->frag_data) ++ kunmap_atomic(st->frag_data); ++} ++EXPORT_SYMBOL(skb_abort_seq_read); ++ ++#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) ++ ++static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, ++ struct ts_config *conf, ++ struct ts_state *state) ++{ ++ return skb_seq_read(offset, text, TS_SKB_CB(state)); ++} ++ ++static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) ++{ ++ skb_abort_seq_read(TS_SKB_CB(state)); ++} ++ ++/** ++ * skb_find_text - Find a text pattern in skb data ++ * @skb: the buffer to look in ++ * @from: search offset ++ * @to: search limit ++ * @config: textsearch configuration ++ * ++ * Finds a pattern in the skb data according to the specified ++ * textsearch configuration. Use textsearch_next() to retrieve ++ * subsequent occurrences of the pattern. Returns the offset ++ * to the first occurrence or UINT_MAX if no match was found. 
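++ *
++ * A brief sketch of one way to drive this (the pattern, its length and
++ * the "kmp" algorithm choice are only examples):
++ *
++ *	struct ts_config *conf;
++ *	unsigned int pos;
++ *
++ *	conf = textsearch_prepare("kmp", "needle", 6, GFP_KERNEL, TS_AUTOLOAD);
++ *	if (IS_ERR(conf))
++ *		return PTR_ERR(conf);
++ *	pos = skb_find_text(skb, 0, skb->len, conf);
++ *	textsearch_destroy(conf);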
++ */ ++unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct ts_config *config) ++{ ++ struct ts_state state; ++ unsigned int ret; ++ ++ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); ++ ++ config->get_next_block = skb_ts_get_next_block; ++ config->finish = skb_ts_finish; ++ ++ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ++ ++ ret = textsearch_find(config, &state); ++ return (ret <= to - from ? ret : UINT_MAX); ++} ++EXPORT_SYMBOL(skb_find_text); ++ ++int skb_append_pagefrags(struct sk_buff *skb, struct page *page, ++ int offset, size_t size) ++{ ++ int i = skb_shinfo(skb)->nr_frags; ++ ++ if (skb_can_coalesce(skb, i, page, offset)) { ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); ++ } else if (i < MAX_SKB_FRAGS) { ++ skb_zcopy_downgrade_managed(skb); ++ get_page(page); ++ skb_fill_page_desc_noacc(skb, i, page, offset, size); ++ } else { ++ return -EMSGSIZE; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_append_pagefrags); ++ ++/** ++ * skb_pull_rcsum - pull skb and update receive checksum ++ * @skb: buffer to update ++ * @len: length of data pulled ++ * ++ * This function performs an skb_pull on the packet and updates ++ * the CHECKSUM_COMPLETE checksum. It should be used on ++ * receive path processing instead of skb_pull unless you know ++ * that the checksum difference is zero (e.g., a valid IP header) ++ * or you are setting ip_summed to CHECKSUM_NONE. ++ */ ++void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) ++{ ++ unsigned char *data = skb->data; ++ ++ BUG_ON(len > skb->len); ++ __skb_pull(skb, len); ++ skb_postpull_rcsum(skb, data, len); ++ return skb->data; ++} ++EXPORT_SYMBOL_GPL(skb_pull_rcsum); ++ ++static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) ++{ ++ skb_frag_t head_frag; ++ struct page *page; ++ ++ page = virt_to_head_page(frag_skb->head); ++ __skb_frag_set_page(&head_frag, page); ++ skb_frag_off_set(&head_frag, frag_skb->data - ++ (unsigned char *)page_address(page)); ++ skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); ++ return head_frag; ++} ++ ++struct sk_buff *skb_segment_list(struct sk_buff *skb, ++ netdev_features_t features, ++ unsigned int offset) ++{ ++ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; ++ unsigned int tnl_hlen = skb_tnl_header_len(skb); ++ unsigned int delta_truesize = 0; ++ unsigned int delta_len = 0; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *nskb, *tmp; ++ int len_diff, err; ++ ++ skb_push(skb, -skb_network_offset(skb) + offset); ++ ++ skb_shinfo(skb)->frag_list = NULL; ++ ++ do { ++ nskb = list_skb; ++ list_skb = list_skb->next; ++ ++ err = 0; ++ delta_truesize += nskb->truesize; ++ if (skb_shared(nskb)) { ++ tmp = skb_clone(nskb, GFP_ATOMIC); ++ if (tmp) { ++ consume_skb(nskb); ++ nskb = tmp; ++ err = skb_unclone(nskb, GFP_ATOMIC); ++ } else { ++ err = -ENOMEM; ++ } ++ } ++ ++ if (!tail) ++ skb->next = nskb; ++ else ++ tail->next = nskb; ++ ++ if (unlikely(err)) { ++ nskb->next = list_skb; ++ goto err_linearize; ++ } ++ ++ tail = nskb; ++ ++ delta_len += nskb->len; ++ ++ skb_push(nskb, -skb_network_offset(nskb) + offset); ++ ++ skb_release_head_state(nskb); ++ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); ++ __copy_skb_header(nskb, skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); ++ nskb->transport_header += len_diff; ++ skb_copy_from_linear_data_offset(skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ offset + tnl_hlen); ++ ++ if 
(skb_needs_linearize(nskb, features) && ++ __skb_linearize(nskb)) ++ goto err_linearize; ++ ++ } while (list_skb); ++ ++ skb->truesize = skb->truesize - delta_truesize; ++ skb->data_len = skb->data_len - delta_len; ++ skb->len = skb->len - delta_len; ++ ++ skb_gso_reset(skb); ++ ++ skb->prev = tail; ++ ++ if (skb_needs_linearize(skb, features) && ++ __skb_linearize(skb)) ++ goto err_linearize; ++ ++ skb_get(skb); ++ ++ return skb; ++ ++err_linearize: ++ kfree_skb_list(skb->next); ++ skb->next = NULL; ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(skb_segment_list); ++ ++/** ++ * skb_segment - Perform protocol segmentation on skb. ++ * @head_skb: buffer to segment ++ * @features: features for the output path (see dev->features) ++ * ++ * This function performs segmentation on the given skb. It returns ++ * a pointer to the first in a list of new skbs for the segments. ++ * In case of error it returns ERR_PTR(err). ++ */ ++struct sk_buff *skb_segment(struct sk_buff *head_skb, ++ netdev_features_t features) ++{ ++ struct sk_buff *segs = NULL; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; ++ skb_frag_t *frag = skb_shinfo(head_skb)->frags; ++ unsigned int mss = skb_shinfo(head_skb)->gso_size; ++ unsigned int doffset = head_skb->data - skb_mac_header(head_skb); ++ struct sk_buff *frag_skb = head_skb; ++ unsigned int offset = doffset; ++ unsigned int tnl_hlen = skb_tnl_header_len(head_skb); ++ unsigned int partial_segs = 0; ++ unsigned int headroom; ++ unsigned int len = head_skb->len; ++ __be16 proto; ++ bool csum, sg; ++ int nfrags = skb_shinfo(head_skb)->nr_frags; ++ int err = -ENOMEM; ++ int i = 0; ++ int pos; ++ ++ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && ++ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { ++ struct sk_buff *check_skb; ++ ++ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { ++ if (skb_headlen(check_skb) && !check_skb->head_frag) { ++ /* gso_size is untrusted, and we have a frag_list with ++ * a linear non head_frag item. ++ * ++ * If head_skb's headlen does not fit requested gso_size, ++ * it means that the frag_list members do NOT terminate ++ * on exact gso_size boundaries. Hence we cannot perform ++ * skb_frag_t page sharing. Therefore we must fallback to ++ * copying the frag_list skbs; we do so by disabling SG. ++ */ ++ features &= ~NETIF_F_SG; ++ break; ++ } ++ } ++ } ++ ++ __skb_push(head_skb, doffset); ++ proto = skb_network_protocol(head_skb, NULL); ++ if (unlikely(!proto)) ++ return ERR_PTR(-EINVAL); ++ ++ sg = !!(features & NETIF_F_SG); ++ csum = !!can_checksum_protocol(features, proto); ++ ++ if (sg && csum && (mss != GSO_BY_FRAGS)) { ++ if (!(features & NETIF_F_GSO_PARTIAL)) { ++ struct sk_buff *iter; ++ unsigned int frag_len; ++ ++ if (!list_skb || ++ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) ++ goto normal; ++ ++ /* If we get here then all the required ++ * GSO features except frag_list are supported. ++ * Try to split the SKB to multiple GSO SKBs ++ * with no frag_list. ++ * Currently we can do that only when the buffers don't ++ * have a linear part and all the buffers except ++ * the last are of the same length. 
++ */ ++ frag_len = list_skb->len; ++ skb_walk_frags(head_skb, iter) { ++ if (frag_len != iter->len && iter->next) ++ goto normal; ++ if (skb_headlen(iter) && !iter->head_frag) ++ goto normal; ++ ++ len -= iter->len; ++ } ++ ++ if (len != frag_len) ++ goto normal; ++ } ++ ++ /* GSO partial only requires that we trim off any excess that ++ * doesn't fit into an MSS sized block, so take care of that ++ * now. ++ */ ++ partial_segs = len / mss; ++ if (partial_segs > 1) ++ mss *= partial_segs; ++ else ++ partial_segs = 0; ++ } ++ ++normal: ++ headroom = skb_headroom(head_skb); ++ pos = skb_headlen(head_skb); ++ ++ do { ++ struct sk_buff *nskb; ++ skb_frag_t *nskb_frag; ++ int hsize; ++ int size; ++ ++ if (unlikely(mss == GSO_BY_FRAGS)) { ++ len = list_skb->len; ++ } else { ++ len = head_skb->len - offset; ++ if (len > mss) ++ len = mss; ++ } ++ ++ hsize = skb_headlen(head_skb) - offset; ++ ++ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && ++ (skb_headlen(list_skb) == len || sg)) { ++ BUG_ON(skb_headlen(list_skb) > len); ++ ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ pos += skb_headlen(list_skb); ++ ++ while (pos < offset + len) { ++ BUG_ON(i >= nfrags); ++ ++ size = skb_frag_size(frag); ++ if (pos + size > offset + len) ++ break; ++ ++ i++; ++ pos += size; ++ frag++; ++ } ++ ++ nskb = skb_clone(list_skb, GFP_ATOMIC); ++ list_skb = list_skb->next; ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ if (unlikely(pskb_trim(nskb, len))) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ hsize = skb_end_offset(nskb); ++ if (skb_cow_head(nskb, doffset + headroom)) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ nskb->truesize += skb_end_offset(nskb) - hsize; ++ skb_release_head_state(nskb); ++ __skb_push(nskb, doffset); ++ } else { ++ if (hsize < 0) ++ hsize = 0; ++ if (hsize > len || !sg) ++ hsize = len; ++ ++ nskb = __alloc_skb(hsize + doffset + headroom, ++ GFP_ATOMIC, skb_alloc_rx_flag(head_skb), ++ NUMA_NO_NODE); ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, doffset); ++ } ++ ++ if (segs) ++ tail->next = nskb; ++ else ++ segs = nskb; ++ tail = nskb; ++ ++ __copy_skb_header(nskb, head_skb); ++ ++ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); ++ skb_reset_mac_len(nskb); ++ ++ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ doffset + tnl_hlen); ++ ++ if (nskb->len == len + doffset) ++ goto perform_csum_check; ++ ++ if (!sg) { ++ if (!csum) { ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_copy_and_csum_bits(head_skb, offset, ++ skb_put(nskb, ++ len), ++ len); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } else { ++ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) ++ goto err; ++ } ++ continue; ++ } ++ ++ nskb_frag = skb_shinfo(nskb)->frags; ++ ++ skb_copy_from_linear_data_offset(head_skb, offset, ++ skb_put(nskb, hsize), hsize); ++ ++ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & ++ SKBFL_SHARED_FRAG; ++ ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) ++ goto err; ++ ++ while (pos < offset + len) { ++ if (i >= nfrags) { ++ i = 0; ++ nfrags = skb_shinfo(list_skb)->nr_frags; ++ frag = skb_shinfo(list_skb)->frags; ++ frag_skb = list_skb; ++ if (!skb_headlen(list_skb)) { ++ BUG_ON(!nfrags); ++ } else { ++ BUG_ON(!list_skb->head_frag); ++ ++ /* to make room for head_frag. 
*/ ++ i--; ++ frag--; ++ } ++ if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || ++ skb_zerocopy_clone(nskb, frag_skb, ++ GFP_ATOMIC)) ++ goto err; ++ ++ list_skb = list_skb->next; ++ } ++ ++ if (unlikely(skb_shinfo(nskb)->nr_frags >= ++ MAX_SKB_FRAGS)) { ++ net_warn_ratelimited( ++ "skb_segment: too many frags: %u %u\n", ++ pos, mss); ++ err = -EINVAL; ++ goto err; ++ } ++ ++ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; ++ __skb_frag_ref(nskb_frag); ++ size = skb_frag_size(nskb_frag); ++ ++ if (pos < offset) { ++ skb_frag_off_add(nskb_frag, offset - pos); ++ skb_frag_size_sub(nskb_frag, offset - pos); ++ } ++ ++ skb_shinfo(nskb)->nr_frags++; ++ ++ if (pos + size <= offset + len) { ++ i++; ++ frag++; ++ pos += size; ++ } else { ++ skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); ++ goto skip_fraglist; ++ } ++ ++ nskb_frag++; ++ } ++ ++skip_fraglist: ++ nskb->data_len = len - hsize; ++ nskb->len += nskb->data_len; ++ nskb->truesize += nskb->data_len; ++ ++perform_csum_check: ++ if (!csum) { ++ if (skb_has_shared_frag(nskb) && ++ __skb_linearize(nskb)) ++ goto err; ++ ++ if (!nskb->remcsum_offload) ++ nskb->ip_summed = CHECKSUM_NONE; ++ SKB_GSO_CB(nskb)->csum = ++ skb_checksum(nskb, doffset, ++ nskb->len - doffset, 0); ++ SKB_GSO_CB(nskb)->csum_start = ++ skb_headroom(nskb) + doffset; ++ } ++ } while ((offset += len) < head_skb->len); ++ ++ /* Some callers want to get the end of the list. ++ * Put it in segs->prev to avoid walking the list. ++ * (see validate_xmit_skb_list() for example) ++ */ ++ segs->prev = tail; ++ ++ if (partial_segs) { ++ struct sk_buff *iter; ++ int type = skb_shinfo(head_skb)->gso_type; ++ unsigned short gso_size = skb_shinfo(head_skb)->gso_size; ++ ++ /* Update type to add partial and then remove dodgy if set */ ++ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; ++ type &= ~SKB_GSO_DODGY; ++ ++ /* Update GSO info and prepare to start updating headers on ++ * our way back down the stack of protocols. ++ */ ++ for (iter = segs; iter; iter = iter->next) { ++ skb_shinfo(iter)->gso_size = gso_size; ++ skb_shinfo(iter)->gso_segs = partial_segs; ++ skb_shinfo(iter)->gso_type = type; ++ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; ++ } ++ ++ if (tail->len - doffset <= gso_size) ++ skb_shinfo(tail)->gso_size = 0; ++ else if (tail != segs) ++ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); ++ } ++ ++ /* Following permits correct backpressure, for protocols ++ * using skb_set_owner_w(). ++ * Idea is to tranfert ownership from head_skb to last segment. 
++ */ ++ if (head_skb->destructor == sock_wfree) { ++ swap(tail->truesize, head_skb->truesize); ++ swap(tail->destructor, head_skb->destructor); ++ swap(tail->sk, head_skb->sk); ++ } ++ return segs; ++ ++err: ++ kfree_skb_list(segs); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(skb_segment); ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++#define SKB_EXT_ALIGN_VALUE 8 ++#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) ++ ++static const u8 skb_ext_type_len[] = { ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), ++#endif ++#ifdef CONFIG_XFRM ++ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), ++#endif ++}; ++ ++static __always_inline unsigned int skb_ext_total_length(void) ++{ ++ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + ++#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) ++ skb_ext_type_len[SKB_EXT_BRIDGE_NF] + ++#endif ++#ifdef CONFIG_XFRM ++ skb_ext_type_len[SKB_EXT_SEC_PATH] + ++#endif ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) ++ skb_ext_type_len[TC_SKB_EXT] + ++#endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ skb_ext_type_len[SKB_EXT_MPTCP] + ++#endif ++#if IS_ENABLED(CONFIG_MCTP_FLOWS) ++ skb_ext_type_len[SKB_EXT_MCTP] + ++#endif ++ 0; ++} ++ ++static void skb_extensions_init(void) ++{ ++ BUILD_BUG_ON(SKB_EXT_NUM >= 8); ++ BUILD_BUG_ON(skb_ext_total_length() > 255); ++ ++ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", ++ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++} ++#else ++static void skb_extensions_init(void) {} ++#endif ++ ++void __init skb_init(void) ++{ ++ skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", ++ sizeof(struct sk_buff), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ offsetof(struct sk_buff, cb), ++ sizeof_field(struct sk_buff, cb), ++ NULL); ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ sizeof(struct sk_buff_fclones), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++ skb_extensions_init(); ++} ++ ++static int ++__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, ++ unsigned int recursion_level) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int elt = 0; ++ ++ if (unlikely(recursion_level >= 24)) ++ return -EMSGSIZE; ++ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ sg_set_buf(sg, skb->data + offset, copy); ++ elt++; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ sg_set_page(&sg[elt], skb_frag_page(frag), copy, ++ skb_frag_off(frag) + offset - start); ++ elt++; ++ if (!(len -= copy)) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end, ret; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) 
{ ++ if (unlikely(elt && sg_is_last(&sg[elt - 1]))) ++ return -EMSGSIZE; ++ ++ if (copy > len) ++ copy = len; ++ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, ++ copy, recursion_level + 1); ++ if (unlikely(ret < 0)) ++ return ret; ++ elt += ret; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return elt; ++} ++ ++/** ++ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer ++ * @skb: Socket buffer containing the buffers to be mapped ++ * @sg: The scatter-gather list to map into ++ * @offset: The offset into the buffer's contents to start mapping ++ * @len: Length of buffer space to be mapped ++ * ++ * Fill the specified scatter-gather list with mappings/pointers into a ++ * region of the buffer space attached to a socket buffer. Returns either ++ * the number of scatterlist items used, or -EMSGSIZE if the contents ++ * could not fit. ++ */ ++int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); ++ ++ if (nsg <= 0) ++ return nsg; ++ ++ sg_mark_end(&sg[nsg - 1]); ++ ++ return nsg; ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec); ++ ++/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given ++ * sglist without mark the sg which contain last skb data as the end. ++ * So the caller can mannipulate sg list as will when padding new data after ++ * the first call without calling sg_unmark_end to expend sg list. ++ * ++ * Scenario to use skb_to_sgvec_nomark: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec_nomark(payload1) ++ * 3. skb_to_sgvec_nomark(payload2) ++ * ++ * This is equivalent to: ++ * 1. sg_init_table ++ * 2. skb_to_sgvec(payload1) ++ * 3. sg_unmark_end ++ * 4. skb_to_sgvec(payload2) ++ * ++ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark ++ * is more preferable. ++ */ ++int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, ++ int offset, int len) ++{ ++ return __skb_to_sgvec(skb, sg, offset, len, 0); ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); ++ ++ ++ ++/** ++ * skb_cow_data - Check that a socket buffer's data buffers are writable ++ * @skb: The socket buffer to check. ++ * @tailbits: Amount of trailing space to be added ++ * @trailer: Returned pointer to the skb where the @tailbits space begins ++ * ++ * Make sure that the data buffers attached to a socket buffer are ++ * writable. If they are not, private copies are made of the data buffers ++ * and the socket buffer is set to use these instead. ++ * ++ * If @tailbits is given, make sure that there is space to write @tailbits ++ * bytes of data beyond current end of socket buffer. @trailer will be ++ * set to point to the skb in which this space begins. ++ * ++ * The number of scatterlist elements required to completely map the ++ * COW'd and extended socket buffer will be returned. ++ */ ++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) ++{ ++ int copyflag; ++ int elt; ++ struct sk_buff *skb1, **skb_p; ++ ++ /* If skb is cloned or its head is paged, reallocate ++ * head pulling out all the pages (pages are considered not writable ++ * at the moment even if they are anonymous). ++ */ ++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && ++ !__pskb_pull_tail(skb, __skb_pagelen(skb))) ++ return -ENOMEM; ++ ++ /* Easy case. Most of packets will go this way. */ ++ if (!skb_has_frag_list(skb)) { ++ /* A little of trouble, not enough of space for trailer. 
++ * This should not happen, when stack is tuned to generate ++ * good frames. OK, on miss we reallocate and reserve even more ++ * space, 128 bytes is fair. */ ++ ++ if (skb_tailroom(skb) < tailbits && ++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ /* Voila! */ ++ *trailer = skb; ++ return 1; ++ } ++ ++ /* Misery. We are in troubles, going to mincer fragments... */ ++ ++ elt = 1; ++ skb_p = &skb_shinfo(skb)->frag_list; ++ copyflag = 0; ++ ++ while ((skb1 = *skb_p) != NULL) { ++ int ntail = 0; ++ ++ /* The fragment is partially pulled by someone, ++ * this can happen on input. Copy it and everything ++ * after it. */ ++ ++ if (skb_shared(skb1)) ++ copyflag = 1; ++ ++ /* If the skb is the last, worry about trailer. */ ++ ++ if (skb1->next == NULL && tailbits) { ++ if (skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1) || ++ skb_tailroom(skb1) < tailbits) ++ ntail = tailbits + 128; ++ } ++ ++ if (copyflag || ++ skb_cloned(skb1) || ++ ntail || ++ skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1)) { ++ struct sk_buff *skb2; ++ ++ /* Fuck, we are miserable poor guys... */ ++ if (ntail == 0) ++ skb2 = skb_copy(skb1, GFP_ATOMIC); ++ else ++ skb2 = skb_copy_expand(skb1, ++ skb_headroom(skb1), ++ ntail, ++ GFP_ATOMIC); ++ if (unlikely(skb2 == NULL)) ++ return -ENOMEM; ++ ++ if (skb1->sk) ++ skb_set_owner_w(skb2, skb1->sk); ++ ++ /* Looking around. Are we still alive? ++ * OK, link new skb, drop old one */ ++ ++ skb2->next = skb1->next; ++ *skb_p = skb2; ++ kfree_skb(skb1); ++ skb1 = skb2; ++ } ++ elt++; ++ *trailer = skb1; ++ skb_p = &skb1->next; ++ } ++ ++ return elt; ++} ++EXPORT_SYMBOL_GPL(skb_cow_data); ++ ++static void sock_rmem_free(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ ++ atomic_sub(skb->truesize, &sk->sk_rmem_alloc); ++} ++ ++static void skb_set_err_queue(struct sk_buff *skb) ++{ ++ /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. ++ * So, it is safe to (mis)use it to mark skbs on the error queue. 
++ */ ++ skb->pkt_type = PACKET_OUTGOING; ++ BUILD_BUG_ON(PACKET_OUTGOING == 0); ++} ++ ++/* ++ * Note: We dont mem charge error packets (no sk_forward_alloc changes) ++ */ ++int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= ++ (unsigned int)READ_ONCE(sk->sk_rcvbuf)) ++ return -ENOMEM; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sock_rmem_free; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ skb_set_err_queue(skb); ++ ++ /* before exiting rcu section, make sure dst is refcounted */ ++ skb_dst_force(skb); ++ ++ skb_queue_tail(&sk->sk_error_queue, skb); ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk_error_report(sk); ++ return 0; ++} ++EXPORT_SYMBOL(sock_queue_err_skb); ++ ++static bool is_icmp_err_skb(const struct sk_buff *skb) ++{ ++ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || ++ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); ++} ++ ++struct sk_buff *sock_dequeue_err_skb(struct sock *sk) ++{ ++ struct sk_buff_head *q = &sk->sk_error_queue; ++ struct sk_buff *skb, *skb_next = NULL; ++ bool icmp_next = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ skb = __skb_dequeue(q); ++ if (skb && (skb_next = skb_peek(q))) { ++ icmp_next = is_icmp_err_skb(skb_next); ++ if (icmp_next) ++ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; ++ } ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ if (is_icmp_err_skb(skb) && !icmp_next) ++ sk->sk_err = 0; ++ ++ if (skb_next) ++ sk_error_report(sk); ++ ++ return skb; ++} ++EXPORT_SYMBOL(sock_dequeue_err_skb); ++ ++/** ++ * skb_clone_sk - create clone of skb, and take reference to socket ++ * @skb: the skb to clone ++ * ++ * This function creates a clone of a buffer that holds a reference on ++ * sk_refcnt. Buffers created via this function are meant to be ++ * returned using sock_queue_err_skb, or free via kfree_skb. ++ * ++ * When passing buffers allocated with this function to sock_queue_err_skb ++ * it is necessary to wrap the call with sock_hold/sock_put in order to ++ * prevent the socket from being released prior to being enqueued on ++ * the sk_error_queue. ++ */ ++struct sk_buff *skb_clone_sk(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ struct sk_buff *clone; ++ ++ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) ++ return NULL; ++ ++ clone = skb_clone(skb, GFP_ATOMIC); ++ if (!clone) { ++ sock_put(sk); ++ return NULL; ++ } ++ ++ clone->sk = sk; ++ clone->destructor = sock_efree; ++ ++ return clone; ++} ++EXPORT_SYMBOL(skb_clone_sk); ++ ++static void __skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct sock *sk, ++ int tstype, ++ bool opt_stats) ++{ ++ struct sock_exterr_skb *serr; ++ int err; ++ ++ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; ++ serr->ee.ee_info = tstype; ++ serr->opt_stats = opt_stats; ++ serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; ++ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { ++ serr->ee.ee_data = skb_shinfo(skb)->tskey; ++ if (sk_is_tcp(sk)) ++ serr->ee.ee_data -= atomic_read(&sk->sk_tskey); ++ } ++ ++ err = sock_queue_err_skb(sk, skb); ++ ++ if (err) ++ kfree_skb(skb); ++} ++ ++static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) ++{ ++ bool ret; ++ ++ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) ++ return true; ++ ++ read_lock_bh(&sk->sk_callback_lock); ++ ret = sk->sk_socket && sk->sk_socket->file && ++ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); ++ read_unlock_bh(&sk->sk_callback_lock); ++ return ret; ++} ++ ++void skb_complete_tx_timestamp(struct sk_buff *skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ struct sock *sk = skb->sk; ++ ++ if (!skb_may_tx_timestamp(sk, false)) ++ goto err; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ *skb_hwtstamps(skb) = *hwtstamps; ++ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); ++ sock_put(sk); ++ return; ++ } ++ ++err: ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); ++ ++void __skb_tstamp_tx(struct sk_buff *orig_skb, ++ const struct sk_buff *ack_skb, ++ struct skb_shared_hwtstamps *hwtstamps, ++ struct sock *sk, int tstype) ++{ ++ struct sk_buff *skb; ++ bool tsonly, opt_stats = false; ++ ++ if (!sk) ++ return; ++ ++ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && ++ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) ++ return; ++ ++ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; ++ if (!skb_may_tx_timestamp(sk, tsonly)) ++ return; ++ ++ if (tsonly) { ++#ifdef CONFIG_INET ++ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && ++ sk_is_tcp(sk)) { ++ skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ++ ack_skb); ++ opt_stats = true; ++ } else ++#endif ++ skb = alloc_skb(0, GFP_ATOMIC); ++ } else { ++ skb = skb_clone(orig_skb, GFP_ATOMIC); ++ } ++ if (!skb) ++ return; ++ ++ if (tsonly) { ++ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & ++ SKBTX_ANY_TSTAMP; ++ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; ++ } ++ ++ if (hwtstamps) ++ *skb_hwtstamps(skb) = *hwtstamps; ++ else ++ __net_timestamp(skb); ++ ++ __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); ++} ++EXPORT_SYMBOL_GPL(__skb_tstamp_tx); ++ ++void skb_tstamp_tx(struct sk_buff *orig_skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, ++ SCM_TSTAMP_SND); ++} ++EXPORT_SYMBOL_GPL(skb_tstamp_tx); ++ ++void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) ++{ ++ struct sock *sk = skb->sk; ++ struct sock_exterr_skb *serr; ++ int err = 1; ++ ++ skb->wifi_acked_valid = 1; ++ skb->wifi_acked = acked; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; ++ ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. 
++ */ ++ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { ++ err = sock_queue_err_skb(sk, skb); ++ sock_put(sk); ++ } ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); ++ ++/** ++ * skb_partial_csum_set - set up and verify partial csum values for packet ++ * @skb: the skb to set ++ * @start: the number of bytes after skb->data to start checksumming. ++ * @off: the offset from start to place the checksum. ++ * ++ * For untrusted partially-checksummed packets, we need to make sure the values ++ * for skb->csum_start and skb->csum_offset are valid so we don't oops. ++ * ++ * This function checks and sets those values and skb->ip_summed: if this ++ * returns false you should drop the packet. ++ */ ++bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) ++{ ++ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); ++ u32 csum_start = skb_headroom(skb) + (u32)start; ++ ++ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { ++ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", ++ start, off, skb_headroom(skb), skb_headlen(skb)); ++ return false; ++ } ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = csum_start; ++ skb->csum_offset = off; ++ skb_set_transport_header(skb, start); ++ return true; ++} ++EXPORT_SYMBOL_GPL(skb_partial_csum_set); ++ ++static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, ++ unsigned int max) ++{ ++ if (skb_headlen(skb) >= len) ++ return 0; ++ ++ /* If we need to pullup then pullup to the max, so we ++ * won't need to do it again. ++ */ ++ if (max > skb->len) ++ max = skb->len; ++ ++ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) ++ return -ENOMEM; ++ ++ if (skb_headlen(skb) < len) ++ return -EPROTO; ++ ++ return 0; ++} ++ ++#define MAX_TCP_HDR_LEN (15 * 4) ++ ++static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, ++ typeof(IPPROTO_IP) proto, ++ unsigned int off) ++{ ++ int err; ++ ++ switch (proto) { ++ case IPPROTO_TCP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), ++ off + MAX_TCP_HDR_LEN); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct tcphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; ++ ++ case IPPROTO_UDP: ++ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), ++ off + sizeof(struct udphdr)); ++ if (!err && !skb_partial_csum_set(skb, off, ++ offsetof(struct udphdr, ++ check))) ++ err = -EPROTO; ++ return err ? ERR_PTR(err) : &udp_hdr(skb)->check; ++ } ++ ++ return ERR_PTR(-EPROTO); ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * maximally sized IP and TCP or UDP headers. 
++ */ ++#define MAX_IP_HDR_LEN 128 ++ ++static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) ++{ ++ unsigned int off; ++ bool fragment; ++ __sum16 *csum; ++ int err; ++ ++ fragment = false; ++ ++ err = skb_maybe_pull_tail(skb, ++ sizeof(struct iphdr), ++ MAX_IP_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ if (ip_is_fragment(ip_hdr(skb))) ++ fragment = true; ++ ++ off = ip_hdrlen(skb); ++ ++ err = -EPROTO; ++ ++ if (fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ skb->len - off, ++ ip_hdr(skb)->protocol, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/* This value should be large enough to cover a tagged ethernet header plus ++ * an IPv6 header, all options, and a maximal TCP or UDP header. ++ */ ++#define MAX_IPV6_HDR_LEN 256 ++ ++#define OPT_HDR(type, skb, off) \ ++ (type *)(skb_network_header(skb) + (off)) ++ ++static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ u8 nexthdr; ++ unsigned int off; ++ unsigned int len; ++ bool fragment; ++ bool done; ++ __sum16 *csum; ++ ++ fragment = false; ++ done = false; ++ ++ off = sizeof(struct ipv6hdr); ++ ++ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ nexthdr = ipv6_hdr(skb)->nexthdr; ++ ++ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); ++ while (off <= len && !done) { ++ switch (nexthdr) { ++ case IPPROTO_DSTOPTS: ++ case IPPROTO_HOPOPTS: ++ case IPPROTO_ROUTING: { ++ struct ipv6_opt_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ipv6_opt_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_optlen(hp); ++ break; ++ } ++ case IPPROTO_AH: { ++ struct ip_auth_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct ip_auth_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct ip_auth_hdr, skb, off); ++ nexthdr = hp->nexthdr; ++ off += ipv6_authlen(hp); ++ break; ++ } ++ case IPPROTO_FRAGMENT: { ++ struct frag_hdr *hp; ++ ++ err = skb_maybe_pull_tail(skb, ++ off + ++ sizeof(struct frag_hdr), ++ MAX_IPV6_HDR_LEN); ++ if (err < 0) ++ goto out; ++ ++ hp = OPT_HDR(struct frag_hdr, skb, off); ++ ++ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) ++ fragment = true; ++ ++ nexthdr = hp->nexthdr; ++ off += sizeof(struct frag_hdr); ++ break; ++ } ++ default: ++ done = true; ++ break; ++ } ++ } ++ ++ err = -EPROTO; ++ ++ if (!done || fragment) ++ goto out; ++ ++ csum = skb_checksum_setup_ip(skb, nexthdr, off); ++ if (IS_ERR(csum)) ++ return PTR_ERR(csum); ++ ++ if (recalculate) ++ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, ++ &ipv6_hdr(skb)->daddr, ++ skb->len - off, nexthdr, 0); ++ err = 0; ++ ++out: ++ return err; ++} ++ ++/** ++ * skb_checksum_setup - set up partial checksum offset ++ * @skb: the skb to set up ++ * @recalculate: if true the pseudo-header checksum will be recalculated ++ */ ++int skb_checksum_setup(struct sk_buff *skb, bool recalculate) ++{ ++ int err; ++ ++ switch (skb->protocol) { ++ case htons(ETH_P_IP): ++ err = skb_checksum_setup_ipv4(skb, recalculate); ++ break; ++ ++ case htons(ETH_P_IPV6): ++ err = skb_checksum_setup_ipv6(skb, recalculate); ++ break; ++ ++ default: ++ err = -EPROTO; ++ break; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(skb_checksum_setup); ++ 
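For context, the skb_checksum_setup() helper carried verbatim in the hunk above is the routine paravirtualised receive paths use to sanity-check CHECKSUM_PARTIAL packets handed over by a frontend. A minimal usage sketch follows; the driver and function names are hypothetical and only skb_checksum_setup(), CHECKSUM_PARTIAL and net_warn_ratelimited() come from the kernel code in this patch:

/* Hypothetical caller, illustrative only -- not part of the patch above. */
static int demo_netfront_fixup_csum(struct sk_buff *skb)
{
	int err;

	/* Only partially checksummed packets need fixing up;
	 * skb_checksum_setup() validates csum_start/csum_offset and,
	 * with recalculate == true, rebuilds the pseudo-header checksum.
	 */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	err = skb_checksum_setup(skb, true);
	if (err)
		net_warn_ratelimited("demo: skb_checksum_setup failed: %d\n",
				     err);
	return err;
}

Passing recalculate == false would skip rebuilding the pseudo-header checksum and only validate the offsets, which is the cheaper option when the caller trusts the checksum value it received.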
++/** ++ * skb_checksum_maybe_trim - maybe trims the given skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * ++ * Checks whether the given skb has data beyond the given transport length. ++ * If so, returns a cloned skb trimmed to this transport length. ++ * Otherwise returns the provided skb. Returns NULL in error cases ++ * (e.g. transport_len exceeds skb length or out-of-memory). ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, ++ unsigned int transport_len) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int len = skb_transport_offset(skb) + transport_len; ++ int ret; ++ ++ if (skb->len < len) ++ return NULL; ++ else if (skb->len == len) ++ return skb; ++ ++ skb_chk = skb_clone(skb, GFP_ATOMIC); ++ if (!skb_chk) ++ return NULL; ++ ++ ret = pskb_trim_rcsum(skb_chk, len); ++ if (ret) { ++ kfree_skb(skb_chk); ++ return NULL; ++ } ++ ++ return skb_chk; ++} ++ ++/** ++ * skb_checksum_trimmed - validate checksum of an skb ++ * @skb: the skb to check ++ * @transport_len: the data length beyond the network header ++ * @skb_chkf: checksum function to use ++ * ++ * Applies the given checksum function skb_chkf to the provided skb. ++ * Returns a checked and maybe trimmed skb. Returns NULL on error. ++ * ++ * If the skb has data beyond the given transport length, then a ++ * trimmed & cloned skb is checked and returned. ++ * ++ * Caller needs to set the skb transport header and free any returned skb if it ++ * differs from the provided skb. ++ */ ++struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, ++ unsigned int transport_len, ++ __sum16(*skb_chkf)(struct sk_buff *skb)) ++{ ++ struct sk_buff *skb_chk; ++ unsigned int offset = skb_transport_offset(skb); ++ __sum16 ret; ++ ++ skb_chk = skb_checksum_maybe_trim(skb, transport_len); ++ if (!skb_chk) ++ goto err; ++ ++ if (!pskb_may_pull(skb_chk, offset)) ++ goto err; ++ ++ skb_pull_rcsum(skb_chk, offset); ++ ret = skb_chkf(skb_chk); ++ skb_push_rcsum(skb_chk, offset); ++ ++ if (ret) ++ goto err; ++ ++ return skb_chk; ++ ++err: ++ if (skb_chk && skb_chk != skb) ++ kfree_skb(skb_chk); ++ ++ return NULL; ++ ++} ++EXPORT_SYMBOL(skb_checksum_trimmed); ++ ++void __skb_warn_lro_forwarding(const struct sk_buff *skb) ++{ ++ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", ++ skb->dev->name); ++} ++EXPORT_SYMBOL(__skb_warn_lro_forwarding); ++ ++void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) ++{ ++ if (head_stolen) { ++ skb_release_head_state(skb); ++ kmem_cache_free(skbuff_head_cache, skb); ++ } else { ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(kfree_skb_partial); ++ ++/** ++ * skb_try_coalesce - try to merge skb to prior one ++ * @to: prior buffer ++ * @from: buffer to add ++ * @fragstolen: pointer to boolean ++ * @delta_truesize: how much more was allocated than was requested ++ */ ++bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, ++ bool *fragstolen, int *delta_truesize) ++{ ++ struct skb_shared_info *to_shinfo, *from_shinfo; ++ int i, delta, len = from->len; ++ ++ *fragstolen = false; ++ ++ if (skb_cloned(to)) ++ return false; ++ ++ /* In general, avoid mixing slab allocated and page_pool allocated ++ * pages within the same SKB. However when @to is not pp_recycle and ++ * @from is cloned, we can transition frag pages from page_pool to ++ * reference counted. 
++ * ++ * On the other hand, don't allow coalescing two pp_recycle SKBs if ++ * @from is cloned, in case the SKB is using page_pool fragment ++ * references (PP_FLAG_PAGE_FRAG). Since we only take full page ++ * references for cloned SKBs at the moment that would result in ++ * inconsistent reference counts. ++ */ ++ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) ++ return false; ++ ++ if (len <= skb_tailroom(to)) { ++ if (len) ++ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); ++ *delta_truesize = 0; ++ return true; ++ } ++ ++ to_shinfo = skb_shinfo(to); ++ from_shinfo = skb_shinfo(from); ++ if (to_shinfo->frag_list || from_shinfo->frag_list) ++ return false; ++ if (skb_zcopy(to) || skb_zcopy(from)) ++ return false; ++ ++ if (skb_headlen(from) != 0) { ++ struct page *page; ++ unsigned int offset; ++ ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags >= MAX_SKB_FRAGS) ++ return false; ++ ++ if (skb_head_is_locked(from)) ++ return false; ++ ++ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ ++ skb_fill_page_desc(to, to_shinfo->nr_frags, ++ page, offset, skb_headlen(from)); ++ *fragstolen = true; ++ } else { ++ if (to_shinfo->nr_frags + ++ from_shinfo->nr_frags > MAX_SKB_FRAGS) ++ return false; ++ ++ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); ++ } ++ ++ WARN_ON_ONCE(delta < len); ++ ++ memcpy(to_shinfo->frags + to_shinfo->nr_frags, ++ from_shinfo->frags, ++ from_shinfo->nr_frags * sizeof(skb_frag_t)); ++ to_shinfo->nr_frags += from_shinfo->nr_frags; ++ ++ if (!skb_cloned(from)) ++ from_shinfo->nr_frags = 0; ++ ++ /* if the skb is not cloned this does nothing ++ * since we set nr_frags to 0. ++ */ ++ for (i = 0; i < from_shinfo->nr_frags; i++) ++ __skb_frag_ref(&from_shinfo->frags[i]); ++ ++ to->truesize += delta; ++ to->len += len; ++ to->data_len += len; ++ ++ *delta_truesize = delta; ++ return true; ++} ++EXPORT_SYMBOL(skb_try_coalesce); ++ ++/** ++ * skb_scrub_packet - scrub an skb ++ * ++ * @skb: buffer to clean ++ * @xnet: packet is crossing netns ++ * ++ * skb_scrub_packet can be used after encapsulating or decapsulting a packet ++ * into/from a tunnel. Some information have to be cleared during these ++ * operations. ++ * skb_scrub_packet can also be used to clean a skb before injecting it in ++ * another namespace (@xnet == true). We have to clear all information in the ++ * skb that could impact namespace isolation. ++ */ ++void skb_scrub_packet(struct sk_buff *skb, bool xnet) ++{ ++ skb->pkt_type = PACKET_HOST; ++ skb->skb_iif = 0; ++ skb->ignore_df = 0; ++ skb_dst_drop(skb); ++ skb_ext_reset(skb); ++ nf_reset_ct(skb); ++ nf_reset_trace(skb); ++ ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++ skb->offload_l3_fwd_mark = 0; ++#endif ++ ++ if (!xnet) ++ return; ++ ++ ipvs_reset(skb); ++ skb->mark = 0; ++ skb_clear_tstamp(skb); ++} ++EXPORT_SYMBOL_GPL(skb_scrub_packet); ++ ++/** ++ * skb_gso_transport_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_transport_seglen is used to determine the real size of the ++ * individual segments, including Layer4 headers (TCP/UDP). ++ * ++ * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
++ */ ++static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) ++{ ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ unsigned int thlen = 0; ++ ++ if (skb->encapsulation) { ++ thlen = skb_inner_transport_header(skb) - ++ skb_transport_header(skb); ++ ++ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) ++ thlen += inner_tcp_hdrlen(skb); ++ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { ++ thlen = tcp_hdrlen(skb); ++ } else if (unlikely(skb_is_gso_sctp(skb))) { ++ thlen = sizeof(struct sctphdr); ++ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { ++ thlen = sizeof(struct udphdr); ++ } ++ /* UFO sets gso_size to the size of the fragmentation ++ * payload, i.e. the size of the L4 (UDP) header is already ++ * accounted for. ++ */ ++ return thlen + shinfo->gso_size; ++} ++ ++/** ++ * skb_gso_network_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_network_seglen is used to determine the real size of the ++ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). ++ * ++ * The MAC/L2 header is not accounted for. ++ */ ++static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - ++ skb_network_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_mac_seglen - Return length of individual segments of a gso packet ++ * ++ * @skb: GSO skb ++ * ++ * skb_gso_mac_seglen is used to determine the real size of the ++ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 ++ * headers (TCP/UDP). ++ */ ++static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) ++{ ++ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); ++ ++ return hdr_len + skb_gso_transport_seglen(skb); ++} ++ ++/** ++ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS ++ * ++ * There are a couple of instances where we have a GSO skb, and we ++ * want to determine what size it would be after it is segmented. ++ * ++ * We might want to check: ++ * - L3+L4+payload size (e.g. IP forwarding) ++ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) ++ * ++ * This is a helper to do that correctly considering GSO_BY_FRAGS. ++ * ++ * @skb: GSO skb ++ * ++ * @seg_len: The segmented length (from skb_gso_*_seglen). In the ++ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. ++ * ++ * @max_len: The maximum permissible length. ++ * ++ * Returns true if the segmented length <= max length. ++ */ ++static inline bool skb_gso_size_check(const struct sk_buff *skb, ++ unsigned int seg_len, ++ unsigned int max_len) { ++ const struct skb_shared_info *shinfo = skb_shinfo(skb); ++ const struct sk_buff *iter; ++ ++ if (shinfo->gso_size != GSO_BY_FRAGS) ++ return seg_len <= max_len; ++ ++ /* Undo this so we can re-use header sizes */ ++ seg_len -= GSO_BY_FRAGS; ++ ++ skb_walk_frags(skb, iter) { ++ if (seg_len + skb_headlen(iter) > max_len) ++ return false; ++ } ++ ++ return true; ++} ++ ++/** ++ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? ++ * ++ * @skb: GSO skb ++ * @mtu: MTU to validate against ++ * ++ * skb_gso_validate_network_len validates if a given skb will fit a ++ * wanted MTU once split. It considers L3 headers, L4 headers, and the ++ * payload. 
++ */ ++bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) ++{ ++ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); ++ ++/** ++ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? ++ * ++ * @skb: GSO skb ++ * @len: length to validate against ++ * ++ * skb_gso_validate_mac_len validates if a given skb will fit a wanted ++ * length once split, including L2, L3 and L4 headers and the payload. ++ */ ++bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) ++{ ++ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); ++} ++EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); ++ ++static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) ++{ ++ int mac_len, meta_len; ++ void *meta; ++ ++ if (skb_cow(skb, skb_headroom(skb)) < 0) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ mac_len = skb->data - skb_mac_header(skb); ++ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { ++ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), ++ mac_len - VLAN_HLEN - ETH_TLEN); ++ } ++ ++ meta_len = skb_metadata_len(skb); ++ if (meta_len) { ++ meta = skb_metadata_end(skb) - meta_len; ++ memmove(meta + VLAN_HLEN, meta, meta_len); ++ } ++ ++ skb->mac_header += VLAN_HLEN; ++ return skb; ++} ++ ++struct sk_buff *skb_vlan_untag(struct sk_buff *skb) ++{ ++ struct vlan_hdr *vhdr; ++ u16 vlan_tci; ++ ++ if (unlikely(skb_vlan_tag_present(skb))) { ++ /* vlan_tci is already set-up so leave this for another time */ ++ return skb; ++ } ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) ++ goto err_free; ++ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ ++ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) ++ goto err_free; ++ ++ vhdr = (struct vlan_hdr *)skb->data; ++ vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); ++ ++ skb_pull_rcsum(skb, VLAN_HLEN); ++ vlan_set_encap_proto(skb, vhdr); ++ ++ skb = skb_reorder_vlan_header(skb); ++ if (unlikely(!skb)) ++ goto err_free; ++ ++ skb_reset_network_header(skb); ++ if (!skb_transport_header_was_set(skb)) ++ skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return skb; ++ ++err_free: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(skb_vlan_untag); ++ ++int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) ++{ ++ if (!pskb_may_pull(skb, write_len)) ++ return -ENOMEM; ++ ++ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) ++ return 0; ++ ++ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(skb_ensure_writable); ++ ++/* remove VLAN header from packet and update csum accordingly. 
++ * expects a non skb_vlan_tag_present skb with a vlan tag payload ++ */ ++int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) ++{ ++ struct vlan_hdr *vhdr; ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = skb_ensure_writable(skb, VLAN_ETH_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ ++ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); ++ *vlan_tci = ntohs(vhdr->h_vlan_TCI); ++ ++ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); ++ __skb_pull(skb, VLAN_HLEN); ++ ++ vlan_set_encap_proto(skb, vhdr); ++ skb->mac_header += VLAN_HLEN; ++ ++ if (skb_network_offset(skb) < ETH_HLEN) ++ skb_set_network_header(skb, ETH_HLEN); ++ ++ skb_reset_mac_len(skb); ++ ++ return err; ++} ++EXPORT_SYMBOL(__skb_vlan_pop); ++ ++/* Pop a vlan tag either from hwaccel or from payload. ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_pop(struct sk_buff *skb) ++{ ++ u16 vlan_tci; ++ __be16 vlan_proto; ++ int err; ++ ++ if (likely(skb_vlan_tag_present(skb))) { ++ __vlan_hwaccel_clear_tag(skb); ++ } else { ++ if (unlikely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (err) ++ return err; ++ } ++ /* move next vlan tag to hw accel tag */ ++ if (likely(!eth_type_vlan(skb->protocol))) ++ return 0; ++ ++ vlan_proto = skb->protocol; ++ err = __skb_vlan_pop(skb, &vlan_tci); ++ if (unlikely(err)) ++ return err; ++ ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_pop); ++ ++/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). ++ * Expects skb->data at mac header. ++ */ ++int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) ++{ ++ if (skb_vlan_tag_present(skb)) { ++ int offset = skb->data - skb_mac_header(skb); ++ int err; ++ ++ if (WARN_ONCE(offset, ++ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", ++ offset)) { ++ return -EINVAL; ++ } ++ ++ err = __vlan_insert_tag(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (err) ++ return err; ++ ++ skb->protocol = skb->vlan_proto; ++ skb->mac_len += VLAN_HLEN; ++ ++ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); ++ } ++ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); ++ return 0; ++} ++EXPORT_SYMBOL(skb_vlan_push); ++ ++/** ++ * skb_eth_pop() - Drop the Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * ++ * Drop the Ethernet header of @skb. ++ * ++ * Expects that skb->data points to the mac header and that no VLAN tags are ++ * present. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_eth_pop(struct sk_buff *skb) ++{ ++ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || ++ skb_network_offset(skb) < ETH_HLEN) ++ return -EPROTO; ++ ++ skb_pull_rcsum(skb, ETH_HLEN); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_pop); ++ ++/** ++ * skb_eth_push() - Add a new Ethernet header at the head of a packet ++ * ++ * @skb: Socket buffer to modify ++ * @dst: Destination MAC address of the new header ++ * @src: Source MAC address of the new header ++ * ++ * Prepend @skb with a new Ethernet header. ++ * ++ * Expects that skb->data points to the mac header, which must be empty. ++ * ++ * Returns 0 on success, -errno otherwise. 
++ */ ++int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, ++ const unsigned char *src) ++{ ++ struct ethhdr *eth; ++ int err; ++ ++ if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) ++ return -EPROTO; ++ ++ err = skb_cow_head(skb, sizeof(*eth)); ++ if (err < 0) ++ return err; ++ ++ skb_push(skb, sizeof(*eth)); ++ skb_reset_mac_header(skb); ++ skb_reset_mac_len(skb); ++ ++ eth = eth_hdr(skb); ++ ether_addr_copy(eth->h_dest, dst); ++ ether_addr_copy(eth->h_source, src); ++ eth->h_proto = skb->protocol; ++ ++ skb_postpush_rcsum(skb, eth, sizeof(*eth)); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_eth_push); ++ ++/* Update the ethertype of hdr and the skb csum value if required. */ ++static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, ++ __be16 ethertype) ++{ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be16 diff[] = { ~hdr->h_proto, ethertype }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ hdr->h_proto = ethertype; ++} ++ ++/** ++ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of ++ * the packet ++ * ++ * @skb: buffer ++ * @mpls_lse: MPLS label stack entry to push ++ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is ++ * ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, ++ int mac_len, bool ethernet) ++{ ++ struct mpls_shim_hdr *lse; ++ int err; ++ ++ if (unlikely(!eth_p_mpls(mpls_proto))) ++ return -EINVAL; ++ ++ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ ++ if (skb->encapsulation) ++ return -EINVAL; ++ ++ err = skb_cow_head(skb, MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (!skb->inner_protocol) { ++ skb_set_inner_network_header(skb, skb_network_offset(skb)); ++ skb_set_inner_protocol(skb, skb->protocol); ++ } ++ ++ skb_push(skb, MPLS_HLEN); ++ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ skb_reset_mac_len(skb); ++ ++ lse = mpls_hdr(skb); ++ lse->label_stack_entry = mpls_lse; ++ skb_postpush_rcsum(skb, lse, MPLS_HLEN); ++ ++ if (ethernet && mac_len >= ETH_HLEN) ++ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); ++ skb->protocol = mpls_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_push); ++ ++/** ++ * skb_mpls_pop() - pop the outermost MPLS header ++ * ++ * @skb: buffer ++ * @next_proto: ethertype of header after popped MPLS header ++ * @mac_len: length of the MAC header ++ * @ethernet: flag to indicate if the packet is ethernet ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, ++ bool ethernet) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return 0; ++ ++ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); ++ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), ++ mac_len); ++ ++ __skb_pull(skb, MPLS_HLEN); ++ skb_reset_mac_header(skb); ++ skb_set_network_header(skb, mac_len); ++ ++ if (ethernet && mac_len >= ETH_HLEN) { ++ struct ethhdr *hdr; ++ ++ /* use mpls_hdr() to get ethertype to account for VLANs. 
*/ ++ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); ++ skb_mod_eth_type(skb, hdr, next_proto); ++ } ++ skb->protocol = next_proto; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_pop); ++ ++/** ++ * skb_mpls_update_lse() - modify outermost MPLS header and update csum ++ * ++ * @skb: buffer ++ * @mpls_lse: new MPLS label stack entry to update to ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) ++{ ++ int err; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); ++ if (unlikely(err)) ++ return err; ++ ++ if (skb->ip_summed == CHECKSUM_COMPLETE) { ++ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; ++ ++ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); ++ } ++ ++ mpls_hdr(skb)->label_stack_entry = mpls_lse; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_mpls_update_lse); ++ ++/** ++ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header ++ * ++ * @skb: buffer ++ * ++ * Expects skb->data at mac header. ++ * ++ * Returns 0 on success, -errno otherwise. ++ */ ++int skb_mpls_dec_ttl(struct sk_buff *skb) ++{ ++ u32 lse; ++ u8 ttl; ++ ++ if (unlikely(!eth_p_mpls(skb->protocol))) ++ return -EINVAL; ++ ++ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) ++ return -ENOMEM; ++ ++ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ++ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; ++ if (!--ttl) ++ return -EINVAL; ++ ++ lse &= ~MPLS_LS_TTL_MASK; ++ lse |= ttl << MPLS_LS_TTL_SHIFT; ++ ++ return skb_mpls_update_lse(skb, cpu_to_be32(lse)); ++} ++EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); ++ ++/** ++ * alloc_skb_with_frags - allocate skb with page frags ++ * ++ * @header_len: size of linear part ++ * @data_len: needed length in frags ++ * @max_page_order: max page order desired. ++ * @errcode: pointer to error code if any ++ * @gfp_mask: allocation mask ++ * ++ * This can be used to allocate a paged skb, given a maximal order for frags. ++ */ ++struct sk_buff *alloc_skb_with_frags(unsigned long header_len, ++ unsigned long data_len, ++ int max_page_order, ++ int *errcode, ++ gfp_t gfp_mask) ++{ ++ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; ++ unsigned long chunk; ++ struct sk_buff *skb; ++ struct page *page; ++ int i; ++ ++ *errcode = -EMSGSIZE; ++ /* Note this test could be relaxed, if we succeed to allocate ++ * high order pages... 
++ */ ++ if (npages > MAX_SKB_FRAGS) ++ return NULL; ++ ++ *errcode = -ENOBUFS; ++ skb = alloc_skb(header_len, gfp_mask); ++ if (!skb) ++ return NULL; ++ ++ skb->truesize += npages << PAGE_SHIFT; ++ ++ for (i = 0; npages > 0; i++) { ++ int order = max_page_order; ++ ++ while (order) { ++ if (npages >= 1 << order) { ++ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | ++ __GFP_COMP | ++ __GFP_NOWARN, ++ order); ++ if (page) ++ goto fill_page; ++ /* Do not retry other high order allocations */ ++ order = 1; ++ max_page_order = 0; ++ } ++ order--; ++ } ++ page = alloc_page(gfp_mask); ++ if (!page) ++ goto failure; ++fill_page: ++ chunk = min_t(unsigned long, data_len, ++ PAGE_SIZE << order); ++ skb_fill_page_desc(skb, i, page, 0, chunk); ++ data_len -= chunk; ++ npages -= 1 << order; ++ } ++ return skb; ++ ++failure: ++ kfree_skb(skb); ++ return NULL; ++} ++EXPORT_SYMBOL(alloc_skb_with_frags); ++ ++/* carve out the first off bytes from skb when off < headlen */ ++static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, ++ const int headlen, gfp_t gfp_mask) ++{ ++ int i; ++ int size = skb_end_offset(skb); ++ int new_hlen = headlen - off; ++ u8 *data; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy real data, and all frags */ ++ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); ++ skb->len -= off; ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, ++ frags[skb_shinfo(skb)->nr_frags])); ++ if (skb_cloned(skb)) { ++ /* drop the old head gracefully */ ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ skb_release_data(skb); ++ } else { ++ /* we can reuse existing recount- all we did was ++ * relocate values ++ */ ++ skb_free_head(skb); ++ } ++ ++ skb->head = data; ++ skb->data = data; ++ skb->head_frag = 0; ++ skb_set_end_offset(skb, size); ++ skb_set_tail_pointer(skb, skb_headlen(skb)); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ ++ return 0; ++} ++ ++static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); ++ ++/* carve out the first eat bytes from skb's frag_list. May recurse into ++ * pskb_carve() ++ */ ++static int pskb_carve_frag_list(struct sk_buff *skb, ++ struct skb_shared_info *shinfo, int eat, ++ gfp_t gfp_mask) ++{ ++ struct sk_buff *list = shinfo->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ if (!list) { ++ pr_err("Not enough bytes to eat. Want %d\n", eat); ++ return -EFAULT; ++ } ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ if (skb_shared(list)) { ++ clone = skb_clone(list, gfp_mask); ++ if (!clone) ++ return -ENOMEM; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without problems. */ ++ insp = list; ++ } ++ if (pskb_carve(list, eat, gfp_mask) < 0) { ++ kfree_skb(clone); ++ return -ENOMEM; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. 
*/ ++ while ((list = shinfo->frag_list) != insp) { ++ shinfo->frag_list = list->next; ++ consume_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ shinfo->frag_list = clone; ++ } ++ return 0; ++} ++ ++/* carve off first len bytes from skb. Split line (off) is in the ++ * non-linear part of skb ++ */ ++static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, ++ int pos, gfp_t gfp_mask) ++{ ++ int i, k = 0; ++ int size = skb_end_offset(skb); ++ u8 *data; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ struct skb_shared_info *shinfo; ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ return -ENOMEM; ++ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree(data); ++ return -ENOMEM; ++ } ++ shinfo = (struct skb_shared_info *)(data + size); ++ for (i = 0; i < nfrags; i++) { ++ int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + fsize > off) { ++ shinfo->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < off) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_off_add(&shinfo->frags[0], off - pos); ++ skb_frag_size_sub(&shinfo->frags[0], off - pos); ++ } ++ skb_frag_ref(skb, i); ++ k++; ++ } ++ pos += fsize; ++ } ++ shinfo->nr_frags = k; ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ /* split line is in frag list */ ++ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { ++ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ ++ if (skb_has_frag_list(skb)) ++ kfree_skb_list(skb_shinfo(skb)->frag_list); ++ kfree(data); ++ return -ENOMEM; ++ } ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data = data; ++ skb_set_end_offset(skb, size); ++ skb_reset_tail_pointer(skb); ++ skb_headers_offset_update(skb, 0); ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ skb->len -= off; ++ skb->data_len = skb->len; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ return 0; ++} ++ ++/* remove len bytes from the beginning of the skb */ ++static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) ++{ ++ int headlen = skb_headlen(skb); ++ ++ if (len < headlen) ++ return pskb_carve_inside_header(skb, len, headlen, gfp); ++ else ++ return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); ++} ++ ++/* Extract to_copy bytes starting at off from skb, and return this in ++ * a new skb ++ */ ++struct sk_buff *pskb_extract(struct sk_buff *skb, int off, ++ int to_copy, gfp_t gfp) ++{ ++ struct sk_buff *clone = skb_clone(skb, gfp); ++ ++ if (!clone) ++ return NULL; ++ ++ if (pskb_carve(clone, off, gfp) < 0 || ++ pskb_trim(clone, to_copy)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ return clone; ++} ++EXPORT_SYMBOL(pskb_extract); ++ ++/** ++ * skb_condense - try to get rid of fragments/frag_list if possible ++ * @skb: buffer ++ * ++ * Can be used to save memory before skb is added to a busy queue. 
++ * If packet has bytes in frags and enough tail room in skb->head, ++ * pull all of them, so that we can free the frags right now and adjust ++ * truesize. ++ * Notes: ++ * We do not reallocate skb->head thus can not fail. ++ * Caller must re-evaluate skb->truesize if needed. ++ */ ++void skb_condense(struct sk_buff *skb) ++{ ++ if (skb->data_len) { ++ if (skb->data_len > skb->end - skb->tail || ++ skb_cloned(skb)) ++ return; ++ ++ /* Nice, we can free page frag(s) right now */ ++ __pskb_pull_tail(skb, skb->data_len); ++ } ++ /* At this point, skb->truesize might be over estimated, ++ * because skb had a fragment, and fragments do not tell ++ * their truesize. ++ * When we pulled its content into skb->head, fragment ++ * was freed, but __pskb_pull_tail() could not possibly ++ * adjust skb->truesize, not knowing the frag truesize. ++ */ ++ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); ++} ++ ++#ifdef CONFIG_SKB_EXTENSIONS ++static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) ++{ ++ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); ++} ++ ++/** ++ * __skb_ext_alloc - allocate a new skb extensions storage ++ * ++ * @flags: See kmalloc(). ++ * ++ * Returns the newly allocated pointer. The pointer can later attached to a ++ * skb via __skb_ext_set(). ++ * Note: caller must handle the skb_ext as an opaque data. ++ */ ++struct skb_ext *__skb_ext_alloc(gfp_t flags) ++{ ++ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); ++ ++ if (new) { ++ memset(new->offset, 0, sizeof(new->offset)); ++ refcount_set(&new->refcnt, 1); ++ } ++ ++ return new; ++} ++ ++static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, ++ unsigned int old_active) ++{ ++ struct skb_ext *new; ++ ++ if (refcount_read(&old->refcnt) == 1) ++ return old; ++ ++ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ ++ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); ++ refcount_set(&new->refcnt, 1); ++ ++#ifdef CONFIG_XFRM ++ if (old_active & (1 << SKB_EXT_SEC_PATH)) { ++ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_hold(sp->xvec[i]); ++ } ++#endif ++ __skb_ext_put(old); ++ return new; ++} ++ ++/** ++ * __skb_ext_set - attach the specified extension storage to this skb ++ * @skb: buffer ++ * @id: extension id ++ * @ext: extension storage previously allocated via __skb_ext_alloc() ++ * ++ * Existing extensions, if any, are cleared. ++ * ++ * Returns the pointer to the extension. ++ */ ++void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, ++ struct skb_ext *ext) ++{ ++ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); ++ ++ skb_ext_put(skb); ++ newlen = newoff + skb_ext_type_len[id]; ++ ext->chunks = newlen; ++ ext->offset[id] = newoff; ++ skb->extensions = ext; ++ skb->active_extensions = 1 << id; ++ return skb_ext_get_ptr(ext, id); ++} ++ ++/** ++ * skb_ext_add - allocate space for given extension, COW if needed ++ * @skb: buffer ++ * @id: extension to allocate space for ++ * ++ * Allocates enough space for the given extension. ++ * If the extension is already present, a pointer to that extension ++ * is returned. ++ * ++ * If the skb was cloned, COW applies and the returned memory can be ++ * modified without changing the extension space of clones buffers. ++ * ++ * Returns pointer to the extension or NULL on allocation failure. 
++ */ ++void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *new, *old = NULL; ++ unsigned int newlen, newoff; ++ ++ if (skb->active_extensions) { ++ old = skb->extensions; ++ ++ new = skb_ext_maybe_cow(old, skb->active_extensions); ++ if (!new) ++ return NULL; ++ ++ if (__skb_ext_exist(new, id)) ++ goto set_active; ++ ++ newoff = new->chunks; ++ } else { ++ newoff = SKB_EXT_CHUNKSIZEOF(*new); ++ ++ new = __skb_ext_alloc(GFP_ATOMIC); ++ if (!new) ++ return NULL; ++ } ++ ++ newlen = newoff + skb_ext_type_len[id]; ++ new->chunks = newlen; ++ new->offset[id] = newoff; ++set_active: ++ skb->slow_gro = 1; ++ skb->extensions = new; ++ skb->active_extensions |= 1 << id; ++ return skb_ext_get_ptr(new, id); ++} ++EXPORT_SYMBOL(skb_ext_add); ++ ++#ifdef CONFIG_XFRM ++static void skb_ext_put_sp(struct sec_path *sp) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < sp->len; i++) ++ xfrm_state_put(sp->xvec[i]); ++} ++#endif ++ ++#ifdef CONFIG_MCTP_FLOWS ++static void skb_ext_put_mctp(struct mctp_flow *flow) ++{ ++ if (flow->key) ++ mctp_key_unref(flow->key); ++} ++#endif ++ ++void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) ++{ ++ struct skb_ext *ext = skb->extensions; ++ ++ skb->active_extensions &= ~(1 << id); ++ if (skb->active_extensions == 0) { ++ skb->extensions = NULL; ++ __skb_ext_put(ext); ++#ifdef CONFIG_XFRM ++ } else if (id == SKB_EXT_SEC_PATH && ++ refcount_read(&ext->refcnt) == 1) { ++ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); ++ ++ skb_ext_put_sp(sp); ++ sp->len = 0; ++#endif ++ } ++} ++EXPORT_SYMBOL(__skb_ext_del); ++ ++void __skb_ext_put(struct skb_ext *ext) ++{ ++ /* If this is last clone, nothing can increment ++ * it after check passes. Avoids one atomic op. ++ */ ++ if (refcount_read(&ext->refcnt) == 1) ++ goto free_now; ++ ++ if (!refcount_dec_and_test(&ext->refcnt)) ++ return; ++free_now: ++#ifdef CONFIG_XFRM ++ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) ++ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); ++#endif ++#ifdef CONFIG_MCTP_FLOWS ++ if (__skb_ext_exist(ext, SKB_EXT_MCTP)) ++ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); ++#endif ++ ++ kmem_cache_free(skbuff_ext_cache, ext); ++} ++EXPORT_SYMBOL(__skb_ext_put); ++#endif /* CONFIG_SKB_EXTENSIONS */ ++ ++/** ++ * skb_attempt_defer_free - queue skb for remote freeing ++ * @skb: buffer ++ * ++ * Put @skb in a per-cpu list, using the cpu which ++ * allocated the skb/pages to reduce false sharing ++ * and memory zone spinlock contention. ++ */ ++void skb_attempt_defer_free(struct sk_buff *skb) ++{ ++ int cpu = skb->alloc_cpu; ++ struct softnet_data *sd; ++ unsigned long flags; ++ unsigned int defer_max; ++ bool kick; ++ ++ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || ++ !cpu_online(cpu) || ++ cpu == raw_smp_processor_id()) { ++nodefer: __kfree_skb(skb); ++ return; ++ } ++ ++ sd = &per_cpu(softnet_data, cpu); ++ defer_max = READ_ONCE(sysctl_skb_defer_max); ++ if (READ_ONCE(sd->defer_count) >= defer_max) ++ goto nodefer; ++ ++ spin_lock_irqsave(&sd->defer_lock, flags); ++ /* Send an IPI every time queue reaches half capacity. 
*/ ++ kick = sd->defer_count == (defer_max >> 1); ++ /* Paired with the READ_ONCE() few lines above */ ++ WRITE_ONCE(sd->defer_count, sd->defer_count + 1); ++ ++ skb->next = sd->defer_list; ++ /* Paired with READ_ONCE() in skb_defer_free_flush() */ ++ WRITE_ONCE(sd->defer_list, skb); ++ spin_unlock_irqrestore(&sd->defer_lock, flags); ++ ++ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU ++ * if we are unlucky enough (this seems very unlikely). ++ */ ++ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) ++ smp_call_function_single_async(cpu, &sd->defer_csd); ++} +diff -rupN linux.orig/net/dsa/slave.c linux/net/dsa/slave.c +--- linux.orig/net/dsa/slave.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/dsa/slave.c 2022-12-04 10:40:26.732034003 -0500 +@@ -934,12 +934,12 @@ static void dsa_slave_get_ethtool_stats( s = per_cpu_ptr(dev->tstats, i); do { @@ -8833,11 +57287,10 @@ index 1291c2431d440..dcc550b871623 100644 data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; -diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index 3ca0cc4678862..dbae0c79d5cfb 100644 ---- a/net/ipv4/af_inet.c -+++ b/net/ipv4/af_inet.c -@@ -1684,9 +1684,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, +diff -rupN linux.orig/net/ipv4/af_inet.c linux/net/ipv4/af_inet.c +--- linux.orig/net/ipv4/af_inet.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv4/af_inet.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1686,9 +1686,9 @@ u64 snmp_get_cpu_field64(void __percpu * bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { @@ -8849,11 +57302,2095 @@ index 3ca0cc4678862..dbae0c79d5cfb 100644 return v; } -diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c -index b7de5e46fdd8f..f84da849819cc 100644 ---- a/net/ipv6/seg6_local.c -+++ b/net/ipv6/seg6_local.c -@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) +diff -rupN linux.orig/net/ipv4/af_inet.c.orig linux/net/ipv4/af_inet.c.orig +--- linux.orig/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux/net/ipv4/af_inet.c.orig 2022-12-04 10:40:18.732054506 -0500 +@@ -0,0 +1,2081 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * PF_INET protocol family socket handler. ++ * ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Florian La Roche, ++ * Alan Cox, ++ * ++ * Changes (see also sock.c) ++ * ++ * piggy, ++ * Karl Knutson : Socket protocol table ++ * A.N.Kuznetsov : Socket death error in accept(). ++ * John Richardson : Fix non blocking error in connect() ++ * so sockets that fail to connect ++ * don't return -EINPROGRESS. ++ * Alan Cox : Asynchronous I/O support ++ * Alan Cox : Keep correct socket pointer on sock ++ * structures ++ * when accept() ed ++ * Alan Cox : Semantics of SO_LINGER aren't state ++ * moved to close when you look carefully. ++ * With this fixed and the accept bug fixed ++ * some RPC stuff seems happier. ++ * Niibe Yutaka : 4.4BSD style write async I/O ++ * Alan Cox, ++ * Tony Gale : Fixed reuse semantics. ++ * Alan Cox : bind() shouldn't abort existing but dead ++ * sockets. Stops FTP netin:.. I hope. ++ * Alan Cox : bind() works correctly for RAW sockets. ++ * Note that FreeBSD at least was broken ++ * in this respect so be careful with ++ * compatibility tests... 
++ * Alan Cox : routing cache support ++ * Alan Cox : memzero the socket structure for ++ * compactness. ++ * Matt Day : nonblock connect error handler ++ * Alan Cox : Allow large numbers of pending sockets ++ * (eg for big web sites), but only if ++ * specifically application requested. ++ * Alan Cox : New buffering throughout IP. Used ++ * dumbly. ++ * Alan Cox : New buffering now used smartly. ++ * Alan Cox : BSD rather than common sense ++ * interpretation of listen. ++ * Germano Caronni : Assorted small races. ++ * Alan Cox : sendmsg/recvmsg basic support. ++ * Alan Cox : Only sendmsg/recvmsg now supported. ++ * Alan Cox : Locked down bind (see security list). ++ * Alan Cox : Loosened bind a little. ++ * Mike McLagan : ADD/DEL DLCI Ioctls ++ * Willy Konynenberg : Transparent proxying support. ++ * David S. Miller : New socket lookup architecture. ++ * Some other random speedups. ++ * Cyrus Durgin : Cleaned up file for kmod hacks. ++ * Andi Kleen : Fix inet_stream_connect TCP race. ++ */ ++ ++#define pr_fmt(fmt) "IPv4: " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_IP_MROUTE ++#include ++#endif ++#include ++#include ++ ++#include ++ ++/* The inetsw table contains everything that inet_create needs to ++ * build a new socket. ++ */ ++static struct list_head inetsw[SOCK_MAX]; ++static DEFINE_SPINLOCK(inetsw_lock); ++ ++/* New destruction routine */ ++ ++void inet_sock_destruct(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ __skb_queue_purge(&sk->sk_receive_queue); ++ __skb_queue_purge(&sk->sk_error_queue); ++ ++ sk_mem_reclaim_final(sk); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { ++ pr_err("Attempt to release TCP socket in state %d %p\n", ++ sk->sk_state, sk); ++ return; ++ } ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ pr_err("Attempt to release alive inet socket %p\n", sk); ++ return; ++ } ++ ++ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); ++ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); ++ WARN_ON_ONCE(sk->sk_wmem_queued); ++ WARN_ON_ONCE(sk_forward_alloc_get(sk)); ++ ++ kfree(rcu_dereference_protected(inet->inet_opt, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); ++ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); ++ sk_refcnt_debug_dec(sk); ++} ++EXPORT_SYMBOL(inet_sock_destruct); ++ ++/* ++ * The routines beyond this point handle the behaviour of an AF_INET ++ * socket object. Mostly it punts to the subprotocols of IP to do ++ * the work. ++ */ ++ ++/* ++ * Automatically bind an unbound socket. ++ */ ++ ++static int inet_autobind(struct sock *sk) ++{ ++ struct inet_sock *inet; ++ /* We may need to bind the socket. */ ++ lock_sock(sk); ++ inet = inet_sk(sk); ++ if (!inet->inet_num) { ++ if (sk->sk_prot->get_port(sk, 0)) { ++ release_sock(sk); ++ return -EAGAIN; ++ } ++ inet->inet_sport = htons(inet->inet_num); ++ } ++ release_sock(sk); ++ return 0; ++} ++ ++/* ++ * Move a socket into listening state. 
++ */ ++int inet_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ unsigned char old_state; ++ int err, tcp_fastopen; ++ ++ lock_sock(sk); ++ ++ err = -EINVAL; ++ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) ++ goto out; ++ ++ old_state = sk->sk_state; ++ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) ++ goto out; ++ ++ WRITE_ONCE(sk->sk_max_ack_backlog, backlog); ++ /* Really, if the socket is already in listen state ++ * we can only allow the backlog to be adjusted. ++ */ ++ if (old_state != TCP_LISTEN) { ++ /* Enable TFO w/o requiring TCP_FASTOPEN socket option. ++ * Note that only TCP sockets (SOCK_STREAM) will reach here. ++ * Also fastopen backlog may already been set via the option ++ * because the socket was in TCP_LISTEN state previously but ++ * was shutdown() rather than close(). ++ */ ++ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); ++ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && ++ (tcp_fastopen & TFO_SERVER_ENABLE) && ++ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { ++ fastopen_queue_tune(sk, backlog); ++ tcp_fastopen_init_key_once(sock_net(sk)); ++ } ++ ++ err = inet_csk_listen_start(sk); ++ if (err) ++ goto out; ++ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); ++ } ++ err = 0; ++ ++out: ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_listen); ++ ++/* ++ * Create an inet socket. ++ */ ++ ++static int inet_create(struct net *net, struct socket *sock, int protocol, ++ int kern) ++{ ++ struct sock *sk; ++ struct inet_protosw *answer; ++ struct inet_sock *inet; ++ struct proto *answer_prot; ++ unsigned char answer_flags; ++ int try_loading_module = 0; ++ int err; ++ ++ if (protocol < 0 || protocol >= IPPROTO_MAX) ++ return -EINVAL; ++ ++ sock->state = SS_UNCONNECTED; ++ ++ /* Look for the requested type/protocol pair. */ ++lookup_protocol: ++ err = -ESOCKTNOSUPPORT; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { ++ ++ err = 0; ++ /* Check the non-wild match. */ ++ if (protocol == answer->protocol) { ++ if (protocol != IPPROTO_IP) ++ break; ++ } else { ++ /* Check for the two wild cases. */ ++ if (IPPROTO_IP == protocol) { ++ protocol = answer->protocol; ++ break; ++ } ++ if (IPPROTO_IP == answer->protocol) ++ break; ++ } ++ err = -EPROTONOSUPPORT; ++ } ++ ++ if (unlikely(err)) { ++ if (try_loading_module < 2) { ++ rcu_read_unlock(); ++ /* ++ * Be more specific, e.g. net-pf-2-proto-132-type-1 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) ++ */ ++ if (++try_loading_module == 1) ++ request_module("net-pf-%d-proto-%d-type-%d", ++ PF_INET, protocol, sock->type); ++ /* ++ * Fall back to generic, e.g. 
net-pf-2-proto-132 ++ * (net-pf-PF_INET-proto-IPPROTO_SCTP) ++ */ ++ else ++ request_module("net-pf-%d-proto-%d", ++ PF_INET, protocol); ++ goto lookup_protocol; ++ } else ++ goto out_rcu_unlock; ++ } ++ ++ err = -EPERM; ++ if (sock->type == SOCK_RAW && !kern && ++ !ns_capable(net->user_ns, CAP_NET_RAW)) ++ goto out_rcu_unlock; ++ ++ sock->ops = answer->ops; ++ answer_prot = answer->prot; ++ answer_flags = answer->flags; ++ rcu_read_unlock(); ++ ++ WARN_ON(!answer_prot->slab); ++ ++ err = -ENOMEM; ++ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); ++ if (!sk) ++ goto out; ++ ++ err = 0; ++ if (INET_PROTOSW_REUSE & answer_flags) ++ sk->sk_reuse = SK_CAN_REUSE; ++ ++ inet = inet_sk(sk); ++ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; ++ ++ inet->nodefrag = 0; ++ ++ if (SOCK_RAW == sock->type) { ++ inet->inet_num = protocol; ++ if (IPPROTO_RAW == protocol) ++ inet->hdrincl = 1; ++ } ++ ++ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) ++ inet->pmtudisc = IP_PMTUDISC_DONT; ++ else ++ inet->pmtudisc = IP_PMTUDISC_WANT; ++ ++ inet->inet_id = 0; ++ ++ sock_init_data(sock, sk); ++ ++ sk->sk_destruct = inet_sock_destruct; ++ sk->sk_protocol = protocol; ++ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; ++ ++ inet->uc_ttl = -1; ++ inet->mc_loop = 1; ++ inet->mc_ttl = 1; ++ inet->mc_all = 1; ++ inet->mc_index = 0; ++ inet->mc_list = NULL; ++ inet->rcv_tos = 0; ++ ++ sk_refcnt_debug_inc(sk); ++ ++ if (inet->inet_num) { ++ /* It assumes that any protocol which allows ++ * the user to assign a number at socket ++ * creation time automatically ++ * shares. ++ */ ++ inet->inet_sport = htons(inet->inet_num); ++ /* Add to protocol hash chains. */ ++ err = sk->sk_prot->hash(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (sk->sk_prot->init) { ++ err = sk->sk_prot->init(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++ ++ if (!kern) { ++ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); ++ if (err) { ++ sk_common_release(sk); ++ goto out; ++ } ++ } ++out: ++ return err; ++out_rcu_unlock: ++ rcu_read_unlock(); ++ goto out; ++} ++ ++ ++/* ++ * The peer socket should always be NULL (or else). When we call this ++ * function we are destroying the object and from then on nobody ++ * should refer to it. ++ */ ++int inet_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ long timeout; ++ ++ if (!sk->sk_kern_sock) ++ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); ++ ++ /* Applications forget to leave groups before exiting */ ++ ip_mc_drop_socket(sk); ++ ++ /* If linger is set, we don't return until the close ++ * is complete. Otherwise we return immediately. The ++ * actually closing is done the same either way. ++ * ++ * If the close is due to the process exiting, we never ++ * linger.. ++ */ ++ timeout = 0; ++ if (sock_flag(sk, SOCK_LINGER) && ++ !(current->flags & PF_EXITING)) ++ timeout = sk->sk_lingertime; ++ sk->sk_prot->close(sk, timeout); ++ sock->sk = NULL; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(inet_release); ++ ++int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sock *sk = sock->sk; ++ u32 flags = BIND_WITH_LOCK; ++ int err; ++ ++ /* If the socket has its own bind function then use it. (RAW) */ ++ if (sk->sk_prot->bind) { ++ return sk->sk_prot->bind(sk, uaddr, addr_len); ++ } ++ if (addr_len < sizeof(struct sockaddr_in)) ++ return -EINVAL; ++ ++ /* BPF prog is run before any checks are done so that if the prog ++ * changes context in a wrong way it will be caught. 
++ */ ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ CGROUP_INET4_BIND, &flags); ++ if (err) ++ return err; ++ ++ return __inet_bind(sk, uaddr, addr_len, flags); ++} ++EXPORT_SYMBOL(inet_bind); ++ ++int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, ++ u32 flags) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; ++ struct inet_sock *inet = inet_sk(sk); ++ struct net *net = sock_net(sk); ++ unsigned short snum; ++ int chk_addr_ret; ++ u32 tb_id = RT_TABLE_LOCAL; ++ int err; ++ ++ if (addr->sin_family != AF_INET) { ++ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) ++ * only if s_addr is INADDR_ANY. ++ */ ++ err = -EAFNOSUPPORT; ++ if (addr->sin_family != AF_UNSPEC || ++ addr->sin_addr.s_addr != htonl(INADDR_ANY)) ++ goto out; ++ } ++ ++ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; ++ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ++ ++ /* Not specified by any standard per-se, however it breaks too ++ * many applications when removed. It is unfortunate since ++ * allowing applications to make a non-local bind solves ++ * several problems with systems using dynamic addressing. ++ * (ie. your servers still start up even if your ISDN link ++ * is temporarily down) ++ */ ++ err = -EADDRNOTAVAIL; ++ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, ++ chk_addr_ret)) ++ goto out; ++ ++ snum = ntohs(addr->sin_port); ++ err = -EACCES; ++ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && ++ snum && inet_port_requires_bind_service(net, snum) && ++ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) ++ goto out; ++ ++ /* We keep a pair of addresses. rcv_saddr is the one ++ * used by hash lookups, and saddr is used for transmit. ++ * ++ * In the BSD API these are the same except where it ++ * would be illegal to use them (multicast/broadcast) in ++ * which case the sending device address is used. ++ */ ++ if (flags & BIND_WITH_LOCK) ++ lock_sock(sk); ++ ++ /* Check these errors (active socket, double bind). */ ++ err = -EINVAL; ++ if (sk->sk_state != TCP_CLOSE || inet->inet_num) ++ goto out_release_sock; ++ ++ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) ++ inet->inet_saddr = 0; /* Use device */ ++ ++ /* Make sure we are allowed to bind here. 
*/ ++ if (snum || !(inet->bind_address_no_port || ++ (flags & BIND_FORCE_ADDRESS_NO_PORT))) { ++ if (sk->sk_prot->get_port(sk, snum)) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ err = -EADDRINUSE; ++ goto out_release_sock; ++ } ++ if (!(flags & BIND_FROM_BPF)) { ++ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); ++ if (err) { ++ inet->inet_saddr = inet->inet_rcv_saddr = 0; ++ if (sk->sk_prot->put_port) ++ sk->sk_prot->put_port(sk); ++ goto out_release_sock; ++ } ++ } ++ } ++ ++ if (inet->inet_rcv_saddr) ++ sk->sk_userlocks |= SOCK_BINDADDR_LOCK; ++ if (snum) ++ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; ++ inet->inet_sport = htons(inet->inet_num); ++ inet->inet_daddr = 0; ++ inet->inet_dport = 0; ++ sk_dst_reset(sk); ++ err = 0; ++out_release_sock: ++ if (flags & BIND_WITH_LOCK) ++ release_sock(sk); ++out: ++ return err; ++} ++ ++int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ if (uaddr->sa_family == AF_UNSPEC) ++ return sk->sk_prot->disconnect(sk, flags); ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ return err; ++ } ++ ++ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) ++ return -EAGAIN; ++ return sk->sk_prot->connect(sk, uaddr, addr_len); ++} ++EXPORT_SYMBOL(inet_dgram_connect); ++ ++static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) ++{ ++ DEFINE_WAIT_FUNC(wait, woken_wake_function); ++ ++ add_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending += writebias; ++ ++ /* Basic assumption: if someone sets sk->sk_err, he _must_ ++ * change state of the socket from TCP_SYN_*. ++ * Connect() does not allow to get error notifications ++ * without closing the socket. ++ */ ++ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ release_sock(sk); ++ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ++ lock_sock(sk); ++ if (signal_pending(current) || !timeo) ++ break; ++ } ++ remove_wait_queue(sk_sleep(sk), &wait); ++ sk->sk_write_pending -= writebias; ++ return timeo; ++} ++ ++/* ++ * Connect to a remote host. There is regrettably still a little ++ * TCP 'magic' in here. ++ */ ++int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags, int is_sendmsg) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ long timeo; ++ ++ /* ++ * uaddr can be NULL and addr_len can be 0 if: ++ * sk is a TCP fastopen active socket and ++ * TCP_FASTOPEN_CONNECT sockopt is set and ++ * we already have a valid cookie for this socket. ++ * In this case, user can call write() after connect(). ++ * write() will invoke tcp_sendmsg_fastopen() which calls ++ * __inet_stream_connect(). ++ */ ++ if (uaddr) { ++ if (addr_len < sizeof(uaddr->sa_family)) ++ return -EINVAL; ++ ++ if (uaddr->sa_family == AF_UNSPEC) { ++ err = sk->sk_prot->disconnect(sk, flags); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ goto out; ++ } ++ } ++ ++ switch (sock->state) { ++ default: ++ err = -EINVAL; ++ goto out; ++ case SS_CONNECTED: ++ err = -EISCONN; ++ goto out; ++ case SS_CONNECTING: ++ if (inet_sk(sk)->defer_connect) ++ err = is_sendmsg ? 
-EINPROGRESS : -EISCONN; ++ else ++ err = -EALREADY; ++ /* Fall out of switch with err, set for this state */ ++ break; ++ case SS_UNCONNECTED: ++ err = -EISCONN; ++ if (sk->sk_state != TCP_CLOSE) ++ goto out; ++ ++ if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { ++ err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); ++ if (err) ++ goto out; ++ } ++ ++ err = sk->sk_prot->connect(sk, uaddr, addr_len); ++ if (err < 0) ++ goto out; ++ ++ sock->state = SS_CONNECTING; ++ ++ if (!err && inet_sk(sk)->defer_connect) ++ goto out; ++ ++ /* Just entered SS_CONNECTING state; the only ++ * difference is that return value in non-blocking ++ * case is EINPROGRESS, rather than EALREADY. ++ */ ++ err = -EINPROGRESS; ++ break; ++ } ++ ++ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); ++ ++ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { ++ int writebias = (sk->sk_protocol == IPPROTO_TCP) && ++ tcp_sk(sk)->fastopen_req && ++ tcp_sk(sk)->fastopen_req->data ? 1 : 0; ++ ++ /* Error code is set above */ ++ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) ++ goto out; ++ ++ err = sock_intr_errno(timeo); ++ if (signal_pending(current)) ++ goto out; ++ } ++ ++ /* Connection was closed by RST, timeout, ICMP error ++ * or another process disconnected us. ++ */ ++ if (sk->sk_state == TCP_CLOSE) ++ goto sock_error; ++ ++ /* sk->sk_err may be not zero now, if RECVERR was ordered by user ++ * and error was received after socket entered established state. ++ * Hence, it is handled normally after connect() return successfully. ++ */ ++ ++ sock->state = SS_CONNECTED; ++ err = 0; ++out: ++ return err; ++ ++sock_error: ++ err = sock_error(sk) ? : -ECONNABORTED; ++ sock->state = SS_UNCONNECTED; ++ if (sk->sk_prot->disconnect(sk, flags)) ++ sock->state = SS_DISCONNECTING; ++ goto out; ++} ++EXPORT_SYMBOL(__inet_stream_connect); ++ ++int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len, int flags) ++{ ++ int err; ++ ++ lock_sock(sock->sk); ++ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); ++ release_sock(sock->sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_stream_connect); ++ ++/* ++ * Accept a pending connection. The TCP layer now gives BSD semantics. ++ */ ++ ++int inet_accept(struct socket *sock, struct socket *newsock, int flags, ++ bool kern) ++{ ++ struct sock *sk1 = sock->sk; ++ int err = -EINVAL; ++ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); ++ ++ if (!sk2) ++ goto do_err; ++ ++ lock_sock(sk2); ++ ++ sock_rps_record_flow(sk2); ++ WARN_ON(!((1 << sk2->sk_state) & ++ (TCPF_ESTABLISHED | TCPF_SYN_RECV | ++ TCPF_CLOSE_WAIT | TCPF_CLOSE))); ++ ++ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) ++ set_bit(SOCK_SUPPORT_ZC, &newsock->flags); ++ sock_graft(sk2, newsock); ++ ++ newsock->state = SS_CONNECTED; ++ err = 0; ++ release_sock(sk2); ++do_err: ++ return err; ++} ++EXPORT_SYMBOL(inet_accept); ++ ++/* ++ * This does both peername and sockname. 
++ */ ++int inet_getname(struct socket *sock, struct sockaddr *uaddr, ++ int peer) ++{ ++ struct sock *sk = sock->sk; ++ struct inet_sock *inet = inet_sk(sk); ++ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); ++ ++ sin->sin_family = AF_INET; ++ lock_sock(sk); ++ if (peer) { ++ if (!inet->inet_dport || ++ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && ++ peer == 1)) { ++ release_sock(sk); ++ return -ENOTCONN; ++ } ++ sin->sin_port = inet->inet_dport; ++ sin->sin_addr.s_addr = inet->inet_daddr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETPEERNAME); ++ } else { ++ __be32 addr = inet->inet_rcv_saddr; ++ if (!addr) ++ addr = inet->inet_saddr; ++ sin->sin_port = inet->inet_sport; ++ sin->sin_addr.s_addr = addr; ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ CGROUP_INET4_GETSOCKNAME); ++ } ++ release_sock(sk); ++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ++ return sizeof(*sin); ++} ++EXPORT_SYMBOL(inet_getname); ++ ++int inet_send_prepare(struct sock *sk) ++{ ++ sock_rps_record_flow(sk); ++ ++ /* We may need to bind the socket. */ ++ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && ++ inet_autobind(sk)) ++ return -EAGAIN; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(inet_send_prepare); ++ ++int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, ++ sk, msg, size); ++} ++EXPORT_SYMBOL(inet_sendmsg); ++ ++ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return -EAGAIN; ++ ++ if (sk->sk_prot->sendpage) ++ return sk->sk_prot->sendpage(sk, page, offset, size, flags); ++ return sock_no_sendpage(sock, page, offset, size, flags); ++} ++EXPORT_SYMBOL(inet_sendpage); ++ ++INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, ++ size_t, int, int *)); ++int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ++ int flags) ++{ ++ struct sock *sk = sock->sk; ++ int addr_len = 0; ++ int err; ++ ++ if (likely(!(flags & MSG_ERRQUEUE))) ++ sock_rps_record_flow(sk); ++ ++ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, ++ sk, msg, size, flags, &addr_len); ++ if (err >= 0) ++ msg->msg_namelen = addr_len; ++ return err; ++} ++EXPORT_SYMBOL(inet_recvmsg); ++ ++int inet_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ ++ /* This should really check to make sure ++ * the socket is a TCP socket. (WHY AC...) ++ */ ++ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and ++ 1->2 bit 2 snds. ++ 2->3 */ ++ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ ++ return -EINVAL; ++ ++ lock_sock(sk); ++ if (sock->state == SS_CONNECTING) { ++ if ((1 << sk->sk_state) & ++ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) ++ sock->state = SS_DISCONNECTING; ++ else ++ sock->state = SS_CONNECTED; ++ } ++ ++ switch (sk->sk_state) { ++ case TCP_CLOSE: ++ err = -ENOTCONN; ++ /* Hack to wake up other listeners, who can poll for ++ EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ ++ fallthrough; ++ default: ++ sk->sk_shutdown |= how; ++ if (sk->sk_prot->shutdown) ++ sk->sk_prot->shutdown(sk, how); ++ break; ++ ++ /* Remaining two branches are temporary solution for missing ++ * close() in multithreaded environment. 
It is _not_ a good idea, ++ * but we have no choice until close() is repaired at VFS level. ++ */ ++ case TCP_LISTEN: ++ if (!(how & RCV_SHUTDOWN)) ++ break; ++ fallthrough; ++ case TCP_SYN_SENT: ++ err = sk->sk_prot->disconnect(sk, O_NONBLOCK); ++ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; ++ break; ++ } ++ ++ /* Wake up anyone sleeping in poll. */ ++ sk->sk_state_change(sk); ++ release_sock(sk); ++ return err; ++} ++EXPORT_SYMBOL(inet_shutdown); ++ ++/* ++ * ioctl() calls you can issue on an INET socket. Most of these are ++ * device configuration and stuff and very rarely used. Some ioctls ++ * pass on to the socket itself. ++ * ++ * NOTE: I like the idea of a module for the config stuff. ie ifconfig ++ * loads the devconfigure module does its configuring and unloads it. ++ * There's a good 20K of config code hanging around the kernel. ++ */ ++ ++int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ struct sock *sk = sock->sk; ++ int err = 0; ++ struct net *net = sock_net(sk); ++ void __user *p = (void __user *)arg; ++ struct ifreq ifr; ++ struct rtentry rt; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ if (copy_from_user(&rt, p, sizeof(struct rtentry))) ++ return -EFAULT; ++ err = ip_rt_ioctl(net, cmd, &rt); ++ break; ++ case SIOCRTMSG: ++ err = -EINVAL; ++ break; ++ case SIOCDARP: ++ case SIOCGARP: ++ case SIOCSARP: ++ err = arp_ioctl(net, cmd, (void __user *)arg); ++ break; ++ case SIOCGIFADDR: ++ case SIOCGIFBRDADDR: ++ case SIOCGIFNETMASK: ++ case SIOCGIFDSTADDR: ++ case SIOCGIFPFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ if (!err && put_user_ifreq(&ifr, p)) ++ err = -EFAULT; ++ break; ++ ++ case SIOCSIFADDR: ++ case SIOCSIFBRDADDR: ++ case SIOCSIFNETMASK: ++ case SIOCSIFDSTADDR: ++ case SIOCSIFPFLAGS: ++ case SIOCSIFFLAGS: ++ if (get_user_ifreq(&ifr, NULL, p)) ++ return -EFAULT; ++ err = devinet_ioctl(net, cmd, &ifr); ++ break; ++ default: ++ if (sk->sk_prot->ioctl) ++ err = sk->sk_prot->ioctl(sk, cmd, arg); ++ else ++ err = -ENOIOCTLCMD; ++ break; ++ } ++ return err; ++} ++EXPORT_SYMBOL(inet_ioctl); ++ ++#ifdef CONFIG_COMPAT ++static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, ++ struct compat_rtentry __user *ur) ++{ ++ compat_uptr_t rtdev; ++ struct rtentry rt; ++ ++ if (copy_from_user(&rt.rt_dst, &ur->rt_dst, ++ 3 * sizeof(struct sockaddr)) || ++ get_user(rt.rt_flags, &ur->rt_flags) || ++ get_user(rt.rt_metric, &ur->rt_metric) || ++ get_user(rt.rt_mtu, &ur->rt_mtu) || ++ get_user(rt.rt_window, &ur->rt_window) || ++ get_user(rt.rt_irtt, &ur->rt_irtt) || ++ get_user(rtdev, &ur->rt_dev)) ++ return -EFAULT; ++ ++ rt.rt_dev = compat_ptr(rtdev); ++ return ip_rt_ioctl(sock_net(sk), cmd, &rt); ++} ++ ++static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ++{ ++ void __user *argp = compat_ptr(arg); ++ struct sock *sk = sock->sk; ++ ++ switch (cmd) { ++ case SIOCADDRT: ++ case SIOCDELRT: ++ return inet_compat_routing_ioctl(sk, cmd, argp); ++ default: ++ if (!sk->sk_prot->compat_ioctl) ++ return -ENOIOCTLCMD; ++ return sk->sk_prot->compat_ioctl(sk, cmd, arg); ++ } ++} ++#endif /* CONFIG_COMPAT */ ++ ++const struct proto_ops inet_stream_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_stream_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = inet_accept, ++ .getname = inet_getname, ++ .poll = tcp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp 
= sock_gettstamp, ++ .listen = inet_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++#ifdef CONFIG_MMU ++ .mmap = tcp_mmap, ++#endif ++ .sendpage = inet_sendpage, ++ .splice_read = tcp_splice_read, ++ .read_sock = tcp_read_sock, ++ .read_skb = tcp_read_skb, ++ .sendmsg_locked = tcp_sendmsg_locked, ++ .sendpage_locked = tcp_sendpage_locked, ++ .peek_len = tcp_peek_len, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++ .set_rcvlowat = tcp_set_rcvlowat, ++}; ++EXPORT_SYMBOL(inet_stream_ops); ++ ++const struct proto_ops inet_dgram_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = udp_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .read_skb = udp_read_skb, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++ .set_peek_off = sk_set_peek_off, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++EXPORT_SYMBOL(inet_dgram_ops); ++ ++/* ++ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without ++ * udp_poll ++ */ ++static const struct proto_ops inet_sockraw_ops = { ++ .family = PF_INET, ++ .owner = THIS_MODULE, ++ .release = inet_release, ++ .bind = inet_bind, ++ .connect = inet_dgram_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = inet_getname, ++ .poll = datagram_poll, ++ .ioctl = inet_ioctl, ++ .gettstamp = sock_gettstamp, ++ .listen = sock_no_listen, ++ .shutdown = inet_shutdown, ++ .setsockopt = sock_common_setsockopt, ++ .getsockopt = sock_common_getsockopt, ++ .sendmsg = inet_sendmsg, ++ .recvmsg = inet_recvmsg, ++ .mmap = sock_no_mmap, ++ .sendpage = inet_sendpage, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = inet_compat_ioctl, ++#endif ++}; ++ ++static const struct net_proto_family inet_family_ops = { ++ .family = PF_INET, ++ .create = inet_create, ++ .owner = THIS_MODULE, ++}; ++ ++/* Upon startup we insert all the elements in inetsw_array[] into ++ * the linked list inetsw. ++ */ ++static struct inet_protosw inetsw_array[] = ++{ ++ { ++ .type = SOCK_STREAM, ++ .protocol = IPPROTO_TCP, ++ .prot = &tcp_prot, ++ .ops = &inet_stream_ops, ++ .flags = INET_PROTOSW_PERMANENT | ++ INET_PROTOSW_ICSK, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_UDP, ++ .prot = &udp_prot, ++ .ops = &inet_dgram_ops, ++ .flags = INET_PROTOSW_PERMANENT, ++ }, ++ ++ { ++ .type = SOCK_DGRAM, ++ .protocol = IPPROTO_ICMP, ++ .prot = &ping_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ }, ++ ++ { ++ .type = SOCK_RAW, ++ .protocol = IPPROTO_IP, /* wild card */ ++ .prot = &raw_prot, ++ .ops = &inet_sockraw_ops, ++ .flags = INET_PROTOSW_REUSE, ++ } ++}; ++ ++#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) ++ ++void inet_register_protosw(struct inet_protosw *p) ++{ ++ struct list_head *lh; ++ struct inet_protosw *answer; ++ int protocol = p->protocol; ++ struct list_head *last_perm; ++ ++ spin_lock_bh(&inetsw_lock); ++ ++ if (p->type >= SOCK_MAX) ++ goto out_illegal; ++ ++ /* If we are trying to override a permanent protocol, bail. 
*/ ++ last_perm = &inetsw[p->type]; ++ list_for_each(lh, &inetsw[p->type]) { ++ answer = list_entry(lh, struct inet_protosw, list); ++ /* Check only the non-wild match. */ ++ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) ++ break; ++ if (protocol == answer->protocol) ++ goto out_permanent; ++ last_perm = lh; ++ } ++ ++ /* Add the new entry after the last permanent entry if any, so that ++ * the new entry does not override a permanent entry when matched with ++ * a wild-card protocol. But it is allowed to override any existing ++ * non-permanent entry. This means that when we remove this entry, the ++ * system automatically returns to the old behavior. ++ */ ++ list_add_rcu(&p->list, last_perm); ++out: ++ spin_unlock_bh(&inetsw_lock); ++ ++ return; ++ ++out_permanent: ++ pr_err("Attempt to override permanent protocol %d\n", protocol); ++ goto out; ++ ++out_illegal: ++ pr_err("Ignoring attempt to register invalid socket type %d\n", ++ p->type); ++ goto out; ++} ++EXPORT_SYMBOL(inet_register_protosw); ++ ++void inet_unregister_protosw(struct inet_protosw *p) ++{ ++ if (INET_PROTOSW_PERMANENT & p->flags) { ++ pr_err("Attempt to unregister permanent protocol %d\n", ++ p->protocol); ++ } else { ++ spin_lock_bh(&inetsw_lock); ++ list_del_rcu(&p->list); ++ spin_unlock_bh(&inetsw_lock); ++ ++ synchronize_net(); ++ } ++} ++EXPORT_SYMBOL(inet_unregister_protosw); ++ ++static int inet_sk_reselect_saddr(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ __be32 old_saddr = inet->inet_saddr; ++ __be32 daddr = inet->inet_daddr; ++ struct flowi4 *fl4; ++ struct rtable *rt; ++ __be32 new_saddr; ++ struct ip_options_rcu *inet_opt; ++ ++ inet_opt = rcu_dereference_protected(inet->inet_opt, ++ lockdep_sock_is_held(sk)); ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ ++ /* Query new route. */ ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, ++ sk->sk_protocol, inet->inet_sport, ++ inet->inet_dport, sk); ++ if (IS_ERR(rt)) ++ return PTR_ERR(rt); ++ ++ sk_setup_caps(sk, &rt->dst); ++ ++ new_saddr = fl4->saddr; ++ ++ if (new_saddr == old_saddr) ++ return 0; ++ ++ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { ++ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", ++ __func__, &old_saddr, &new_saddr); ++ } ++ ++ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; ++ ++ /* ++ * XXX The only one ugly spot where we need to ++ * XXX really change the sockets identity after ++ * XXX it has entered the hashes. -DaveM ++ * ++ * Besides that, it does not check for connection ++ * uniqueness. Wait for troubles. ++ */ ++ return __sk_prot_rehash(sk); ++} ++ ++int inet_sk_rebuild_header(struct sock *sk) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); ++ __be32 daddr; ++ struct ip_options_rcu *inet_opt; ++ struct flowi4 *fl4; ++ int err; ++ ++ /* Route is OK, nothing to do. */ ++ if (rt) ++ return 0; ++ ++ /* Reroute. */ ++ rcu_read_lock(); ++ inet_opt = rcu_dereference(inet->inet_opt); ++ daddr = inet->inet_daddr; ++ if (inet_opt && inet_opt->opt.srr) ++ daddr = inet_opt->opt.faddr; ++ rcu_read_unlock(); ++ fl4 = &inet->cork.fl.u.ip4; ++ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, ++ inet->inet_dport, inet->inet_sport, ++ sk->sk_protocol, RT_CONN_FLAGS(sk), ++ sk->sk_bound_dev_if); ++ if (!IS_ERR(rt)) { ++ err = 0; ++ sk_setup_caps(sk, &rt->dst); ++ } else { ++ err = PTR_ERR(rt); ++ ++ /* Routing failed... 
*/ ++ sk->sk_route_caps = 0; ++ /* ++ * Other protocols have to map its equivalent state to TCP_SYN_SENT. ++ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme ++ */ ++ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || ++ sk->sk_state != TCP_SYN_SENT || ++ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || ++ (err = inet_sk_reselect_saddr(sk)) != 0) ++ sk->sk_err_soft = -err; ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL(inet_sk_rebuild_header); ++ ++void inet_sk_set_state(struct sock *sk, int state) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, state); ++ sk->sk_state = state; ++} ++EXPORT_SYMBOL(inet_sk_set_state); ++ ++void inet_sk_state_store(struct sock *sk, int newstate) ++{ ++ trace_inet_sock_set_state(sk, sk->sk_state, newstate); ++ smp_store_release(&sk->sk_state, newstate); ++} ++ ++struct sk_buff *inet_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ bool udpfrag = false, fixedid = false, gso_partial, encap; ++ struct sk_buff *segs = ERR_PTR(-EINVAL); ++ const struct net_offload *ops; ++ unsigned int offset = 0; ++ struct iphdr *iph; ++ int proto, tot_len; ++ int nhoff; ++ int ihl; ++ int id; ++ ++ skb_reset_network_header(skb); ++ nhoff = skb_network_header(skb) - skb_mac_header(skb); ++ if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) ++ goto out; ++ ++ iph = ip_hdr(skb); ++ ihl = iph->ihl * 4; ++ if (ihl < sizeof(*iph)) ++ goto out; ++ ++ id = ntohs(iph->id); ++ proto = iph->protocol; ++ ++ /* Warning: after this point, iph might be no longer valid */ ++ if (unlikely(!pskb_may_pull(skb, ihl))) ++ goto out; ++ __skb_pull(skb, ihl); ++ ++ encap = SKB_GSO_CB(skb)->encap_level > 0; ++ if (encap) ++ features &= skb->dev->hw_enc_features; ++ SKB_GSO_CB(skb)->encap_level += ihl; ++ ++ skb_reset_transport_header(skb); ++ ++ segs = ERR_PTR(-EPROTONOSUPPORT); ++ ++ if (!skb->encapsulation || encap) { ++ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ++ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); ++ ++ /* fixed ID is invalid if DF bit is not set */ ++ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) ++ goto out; ++ } ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (likely(ops && ops->callbacks.gso_segment)) { ++ segs = ops->callbacks.gso_segment(skb, features); ++ if (!segs) ++ skb->network_header = skb_mac_header(skb) + nhoff - skb->head; ++ } ++ ++ if (IS_ERR_OR_NULL(segs)) ++ goto out; ++ ++ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); ++ ++ skb = segs; ++ do { ++ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); ++ if (udpfrag) { ++ iph->frag_off = htons(offset >> 3); ++ if (skb->next) ++ iph->frag_off |= htons(IP_MF); ++ offset += skb->len - nhoff - ihl; ++ tot_len = skb->len - nhoff; ++ } else if (skb_is_gso(skb)) { ++ if (!fixedid) { ++ iph->id = htons(id); ++ id += skb_shinfo(skb)->gso_segs; ++ } ++ ++ if (gso_partial) ++ tot_len = skb_shinfo(skb)->gso_size + ++ SKB_GSO_CB(skb)->data_offset + ++ skb->head - (unsigned char *)iph; ++ else ++ tot_len = skb->len - nhoff; ++ } else { ++ if (!fixedid) ++ iph->id = htons(id++); ++ tot_len = skb->len - nhoff; ++ } ++ iph->tot_len = htons(tot_len); ++ ip_send_check(iph); ++ if (encap) ++ skb_reset_inner_headers(skb); ++ skb->network_header = (u8 *)iph - skb->head; ++ skb_reset_mac_len(skb); ++ } while ((skb = skb->next)); ++ ++out: ++ return segs; ++} ++ ++static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, ++ netdev_features_t features) ++{ ++ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) ++ return ERR_PTR(-EINVAL); ++ ++ 
return inet_gso_segment(skb, features); ++} ++ ++struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) ++{ ++ const struct net_offload *ops; ++ struct sk_buff *pp = NULL; ++ const struct iphdr *iph; ++ struct sk_buff *p; ++ unsigned int hlen; ++ unsigned int off; ++ unsigned int id; ++ int flush = 1; ++ int proto; ++ ++ off = skb_gro_offset(skb); ++ hlen = off + sizeof(*iph); ++ iph = skb_gro_header_fast(skb, off); ++ if (skb_gro_header_hard(skb, hlen)) { ++ iph = skb_gro_header_slow(skb, hlen, off); ++ if (unlikely(!iph)) ++ goto out; ++ } ++ ++ proto = iph->protocol; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (!ops || !ops->callbacks.gro_receive) ++ goto out; ++ ++ if (*(u8 *)iph != 0x45) ++ goto out; ++ ++ if (ip_is_fragment(iph)) ++ goto out; ++ ++ if (unlikely(ip_fast_csum((u8 *)iph, 5))) ++ goto out; ++ ++ id = ntohl(*(__be32 *)&iph->id); ++ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); ++ id >>= 16; ++ ++ list_for_each_entry(p, head, list) { ++ struct iphdr *iph2; ++ u16 flush_id; ++ ++ if (!NAPI_GRO_CB(p)->same_flow) ++ continue; ++ ++ iph2 = (struct iphdr *)(p->data + off); ++ /* The above works because, with the exception of the top ++ * (inner most) layer, we only aggregate pkts with the same ++ * hdr length so all the hdrs we'll need to verify will start ++ * at the same offset. ++ */ ++ if ((iph->protocol ^ iph2->protocol) | ++ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ++ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { ++ NAPI_GRO_CB(p)->same_flow = 0; ++ continue; ++ } ++ ++ /* All fields must match except length and checksum. */ ++ NAPI_GRO_CB(p)->flush |= ++ (iph->ttl ^ iph2->ttl) | ++ (iph->tos ^ iph2->tos) | ++ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); ++ ++ NAPI_GRO_CB(p)->flush |= flush; ++ ++ /* We need to store of the IP ID check to be included later ++ * when we can verify that this packet does in fact belong ++ * to a given flow. ++ */ ++ flush_id = (u16)(id - ntohs(iph2->id)); ++ ++ /* This bit of code makes it much easier for us to identify ++ * the cases where we are doing atomic vs non-atomic IP ID ++ * checks. Specifically an atomic check can return IP ID ++ * values 0 - 0xFFFF, while a non-atomic check can only ++ * return 0 or 0xFFFF. ++ */ ++ if (!NAPI_GRO_CB(p)->is_atomic || ++ !(iph->frag_off & htons(IP_DF))) { ++ flush_id ^= NAPI_GRO_CB(p)->count; ++ flush_id = flush_id ? 0xFFFF : 0; ++ } ++ ++ /* If the previous IP ID value was based on an atomic ++ * datagram we can overwrite the value and ignore it. ++ */ ++ if (NAPI_GRO_CB(skb)->is_atomic) ++ NAPI_GRO_CB(p)->flush_id = flush_id; ++ else ++ NAPI_GRO_CB(p)->flush_id |= flush_id; ++ } ++ ++ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); ++ NAPI_GRO_CB(skb)->flush |= flush; ++ skb_set_network_header(skb, off); ++ /* The above will be needed by the transport layer if there is one ++ * immediately following this IP hdr. 
++ */ ++ ++ /* Note : No need to call skb_gro_postpull_rcsum() here, ++ * as we already checked checksum over ipv4 header was 0 ++ */ ++ skb_gro_pull(skb, sizeof(*iph)); ++ skb_set_transport_header(skb, skb_gro_offset(skb)); ++ ++ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ++ ops->callbacks.gro_receive, head, skb); ++ ++out: ++ skb_gro_flush_final(skb, pp, flush); ++ ++ return pp; ++} ++ ++static struct sk_buff *ipip_gro_receive(struct list_head *head, ++ struct sk_buff *skb) ++{ ++ if (NAPI_GRO_CB(skb)->encap_mark) { ++ NAPI_GRO_CB(skb)->flush = 1; ++ return NULL; ++ } ++ ++ NAPI_GRO_CB(skb)->encap_mark = 1; ++ ++ return inet_gro_receive(head, skb); ++} ++ ++#define SECONDS_PER_DAY 86400 ++ ++/* inet_current_timestamp - Return IP network timestamp ++ * ++ * Return milliseconds since midnight in network byte order. ++ */ ++__be32 inet_current_timestamp(void) ++{ ++ u32 secs; ++ u32 msecs; ++ struct timespec64 ts; ++ ++ ktime_get_real_ts64(&ts); ++ ++ /* Get secs since midnight. */ ++ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); ++ /* Convert to msecs. */ ++ msecs = secs * MSEC_PER_SEC; ++ /* Convert nsec to msec. */ ++ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; ++ ++ /* Convert to network byte order. */ ++ return htonl(msecs); ++} ++EXPORT_SYMBOL(inet_current_timestamp); ++ ++int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) ++{ ++ if (sk->sk_family == AF_INET) ++ return ip_recv_error(sk, msg, len, addr_len); ++#if IS_ENABLED(CONFIG_IPV6) ++ if (sk->sk_family == AF_INET6) ++ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); ++#endif ++ return -EINVAL; ++} ++ ++int inet_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ __be16 newlen = htons(skb->len - nhoff); ++ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); ++ const struct net_offload *ops; ++ int proto = iph->protocol; ++ int err = -ENOSYS; ++ ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); ++ skb_set_inner_network_header(skb, nhoff); ++ } ++ ++ csum_replace2(&iph->check, iph->tot_len, newlen); ++ iph->tot_len = newlen; ++ ++ ops = rcu_dereference(inet_offloads[proto]); ++ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) ++ goto out; ++ ++ /* Only need to add sizeof(*iph) to get to the next hdr below ++ * because any hdr with option will have been flushed in ++ * inet_gro_receive(). ++ */ ++ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, ++ tcp4_gro_complete, udp4_gro_complete, ++ skb, nhoff + sizeof(*iph)); ++ ++out: ++ return err; ++} ++ ++static int ipip_gro_complete(struct sk_buff *skb, int nhoff) ++{ ++ skb->encapsulation = 1; ++ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; ++ return inet_gro_complete(skb, nhoff); ++} ++ ++int inet_ctl_sock_create(struct sock **sk, unsigned short family, ++ unsigned short type, unsigned char protocol, ++ struct net *net) ++{ ++ struct socket *sock; ++ int rc = sock_create_kern(net, family, type, protocol, &sock); ++ ++ if (rc == 0) { ++ *sk = sock->sk; ++ (*sk)->sk_allocation = GFP_ATOMIC; ++ /* ++ * Unhash it so that IP input processing does not even see it, ++ * we do not wish this socket to see incoming packets. 
++ */ ++ (*sk)->sk_prot->unhash(*sk); ++ } ++ return rc; ++} ++EXPORT_SYMBOL_GPL(inet_ctl_sock_create); ++ ++unsigned long snmp_fold_field(void __percpu *mib, int offt) ++{ ++ unsigned long res = 0; ++ int i; ++ ++ for_each_possible_cpu(i) ++ res += snmp_get_cpu_field(mib, i, offt); ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field); ++ ++#if BITS_PER_LONG==32 ++ ++u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, ++ size_t syncp_offset) ++{ ++ void *bhptr; ++ struct u64_stats_sync *syncp; ++ u64 v; ++ unsigned int start; ++ ++ bhptr = per_cpu_ptr(mib, cpu); ++ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); ++ do { ++ start = u64_stats_fetch_begin_irq(syncp); ++ v = *(((u64 *)bhptr) + offt); ++ } while (u64_stats_fetch_retry_irq(syncp, start)); ++ ++ return v; ++} ++EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); ++ ++u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) ++{ ++ u64 res = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); ++ } ++ return res; ++} ++EXPORT_SYMBOL_GPL(snmp_fold_field64); ++#endif ++ ++#ifdef CONFIG_IP_MULTICAST ++static const struct net_protocol igmp_protocol = { ++ .handler = igmp_rcv, ++}; ++#endif ++ ++static const struct net_protocol tcp_protocol = { ++ .handler = tcp_v4_rcv, ++ .err_handler = tcp_v4_err, ++ .no_policy = 1, ++ .icmp_strict_tag_validation = 1, ++}; ++ ++static const struct net_protocol udp_protocol = { ++ .handler = udp_rcv, ++ .err_handler = udp_err, ++ .no_policy = 1, ++}; ++ ++static const struct net_protocol icmp_protocol = { ++ .handler = icmp_rcv, ++ .err_handler = icmp_err, ++ .no_policy = 1, ++}; ++ ++static __net_init int ipv4_mib_init_net(struct net *net) ++{ ++ int i; ++ ++ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); ++ if (!net->mib.tcp_statistics) ++ goto err_tcp_mib; ++ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); ++ if (!net->mib.ip_statistics) ++ goto err_ip_mib; ++ ++ for_each_possible_cpu(i) { ++ struct ipstats_mib *af_inet_stats; ++ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); ++ u64_stats_init(&af_inet_stats->syncp); ++ } ++ ++ net->mib.net_statistics = alloc_percpu(struct linux_mib); ++ if (!net->mib.net_statistics) ++ goto err_net_mib; ++ net->mib.udp_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udp_statistics) ++ goto err_udp_mib; ++ net->mib.udplite_statistics = alloc_percpu(struct udp_mib); ++ if (!net->mib.udplite_statistics) ++ goto err_udplite_mib; ++ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); ++ if (!net->mib.icmp_statistics) ++ goto err_icmp_mib; ++ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), ++ GFP_KERNEL); ++ if (!net->mib.icmpmsg_statistics) ++ goto err_icmpmsg_mib; ++ ++ tcp_mib_init(net); ++ return 0; ++ ++err_icmpmsg_mib: ++ free_percpu(net->mib.icmp_statistics); ++err_icmp_mib: ++ free_percpu(net->mib.udplite_statistics); ++err_udplite_mib: ++ free_percpu(net->mib.udp_statistics); ++err_udp_mib: ++ free_percpu(net->mib.net_statistics); ++err_net_mib: ++ free_percpu(net->mib.ip_statistics); ++err_ip_mib: ++ free_percpu(net->mib.tcp_statistics); ++err_tcp_mib: ++ return -ENOMEM; ++} ++ ++static __net_exit void ipv4_mib_exit_net(struct net *net) ++{ ++ kfree(net->mib.icmpmsg_statistics); ++ free_percpu(net->mib.icmp_statistics); ++ free_percpu(net->mib.udplite_statistics); ++ free_percpu(net->mib.udp_statistics); ++ free_percpu(net->mib.net_statistics); ++ free_percpu(net->mib.ip_statistics); ++ 
free_percpu(net->mib.tcp_statistics); ++#ifdef CONFIG_MPTCP ++ /* allocated on demand, see mptcp_init_sock() */ ++ free_percpu(net->mib.mptcp_statistics); ++#endif ++} ++ ++static __net_initdata struct pernet_operations ipv4_mib_ops = { ++ .init = ipv4_mib_init_net, ++ .exit = ipv4_mib_exit_net, ++}; ++ ++static int __init init_ipv4_mibs(void) ++{ ++ return register_pernet_subsys(&ipv4_mib_ops); ++} ++ ++static __net_init int inet_init_net(struct net *net) ++{ ++ /* ++ * Set defaults for local port range ++ */ ++ seqlock_init(&net->ipv4.ip_local_ports.lock); ++ net->ipv4.ip_local_ports.range[0] = 32768; ++ net->ipv4.ip_local_ports.range[1] = 60999; ++ ++ seqlock_init(&net->ipv4.ping_group_range.lock); ++ /* ++ * Sane defaults - nobody may create ping sockets. ++ * Boot scripts should set this to distro-specific group. ++ */ ++ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); ++ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); ++ ++ /* Default values for sysctl-controlled parameters. ++ * We set them here, in case sysctl is not compiled. ++ */ ++ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; ++ net->ipv4.sysctl_ip_fwd_update_priority = 1; ++ net->ipv4.sysctl_ip_dynaddr = 0; ++ net->ipv4.sysctl_ip_early_demux = 1; ++ net->ipv4.sysctl_udp_early_demux = 1; ++ net->ipv4.sysctl_tcp_early_demux = 1; ++ net->ipv4.sysctl_nexthop_compat_mode = 1; ++#ifdef CONFIG_SYSCTL ++ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; ++#endif ++ ++ /* Some igmp sysctl, whose values are always used */ ++ net->ipv4.sysctl_igmp_max_memberships = 20; ++ net->ipv4.sysctl_igmp_max_msf = 10; ++ /* IGMP reports for link-local multicast groups are enabled by default */ ++ net->ipv4.sysctl_igmp_llm_reports = 1; ++ net->ipv4.sysctl_igmp_qrv = 2; ++ ++ net->ipv4.sysctl_fib_notify_on_flag_change = 0; ++ ++ return 0; ++} ++ ++static __net_initdata struct pernet_operations af_inet_ops = { ++ .init = inet_init_net, ++}; ++ ++static int __init init_inet_pernet_ops(void) ++{ ++ return register_pernet_subsys(&af_inet_ops); ++} ++ ++static int ipv4_proc_init(void); ++ ++/* ++ * IP protocol layer initialiser ++ */ ++ ++static struct packet_offload ip_packet_offload __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .callbacks = { ++ .gso_segment = inet_gso_segment, ++ .gro_receive = inet_gro_receive, ++ .gro_complete = inet_gro_complete, ++ }, ++}; ++ ++static const struct net_offload ipip_offload = { ++ .callbacks = { ++ .gso_segment = ipip_gso_segment, ++ .gro_receive = ipip_gro_receive, ++ .gro_complete = ipip_gro_complete, ++ }, ++}; ++ ++static int __init ipip_offload_init(void) ++{ ++ return inet_add_offload(&ipip_offload, IPPROTO_IPIP); ++} ++ ++static int __init ipv4_offload_init(void) ++{ ++ /* ++ * Add offloads ++ */ ++ if (udpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add UDP protocol offload\n", __func__); ++ if (tcpv4_offload_init() < 0) ++ pr_crit("%s: Cannot add TCP protocol offload\n", __func__); ++ if (ipip_offload_init() < 0) ++ pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); ++ ++ dev_add_offload(&ip_packet_offload); ++ return 0; ++} ++ ++fs_initcall(ipv4_offload_init); ++ ++static struct packet_type ip_packet_type __read_mostly = { ++ .type = cpu_to_be16(ETH_P_IP), ++ .func = ip_rcv, ++ .list_func = ip_list_rcv, ++}; ++ ++static int __init inet_init(void) ++{ ++ struct inet_protosw *q; ++ struct list_head *r; ++ int rc; ++ ++ sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); ++ ++ raw_hashinfo_init(&raw_v4_hashinfo); ++ ++ rc = proto_register(&tcp_prot, 1); ++ if 
(rc) ++ goto out; ++ ++ rc = proto_register(&udp_prot, 1); ++ if (rc) ++ goto out_unregister_tcp_proto; ++ ++ rc = proto_register(&raw_prot, 1); ++ if (rc) ++ goto out_unregister_udp_proto; ++ ++ rc = proto_register(&ping_prot, 1); ++ if (rc) ++ goto out_unregister_raw_proto; ++ ++ /* ++ * Tell SOCKET that we are alive... ++ */ ++ ++ (void)sock_register(&inet_family_ops); ++ ++#ifdef CONFIG_SYSCTL ++ ip_static_sysctl_init(); ++#endif ++ ++ /* ++ * Add all the base protocols. ++ */ ++ ++ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) ++ pr_crit("%s: Cannot add ICMP protocol\n", __func__); ++ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) ++ pr_crit("%s: Cannot add UDP protocol\n", __func__); ++ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) ++ pr_crit("%s: Cannot add TCP protocol\n", __func__); ++#ifdef CONFIG_IP_MULTICAST ++ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) ++ pr_crit("%s: Cannot add IGMP protocol\n", __func__); ++#endif ++ ++ /* Register the socket-side information for inet_create. */ ++ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) ++ INIT_LIST_HEAD(r); ++ ++ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) ++ inet_register_protosw(q); ++ ++ /* ++ * Set the ARP module up ++ */ ++ ++ arp_init(); ++ ++ /* ++ * Set the IP module up ++ */ ++ ++ ip_init(); ++ ++ /* Initialise per-cpu ipv4 mibs */ ++ if (init_ipv4_mibs()) ++ panic("%s: Cannot init ipv4 mibs\n", __func__); ++ ++ /* Setup TCP slab cache for open requests. */ ++ tcp_init(); ++ ++ /* Setup UDP memory threshold */ ++ udp_init(); ++ ++ /* Add UDP-Lite (RFC 3828) */ ++ udplite4_register(); ++ ++ raw_init(); ++ ++ ping_init(); ++ ++ /* ++ * Set the ICMP layer up ++ */ ++ ++ if (icmp_init() < 0) ++ panic("Failed to create the ICMP control socket.\n"); ++ ++ /* ++ * Initialise the multicast router ++ */ ++#if defined(CONFIG_IP_MROUTE) ++ if (ip_mr_init()) ++ pr_crit("%s: Cannot init ipv4 mroute\n", __func__); ++#endif ++ ++ if (init_inet_pernet_ops()) ++ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ++ ++ ipv4_proc_init(); ++ ++ ipfrag_init(); ++ ++ dev_add_pack(&ip_packet_type); ++ ++ ip_tunnel_core_init(); ++ ++ rc = 0; ++out: ++ return rc; ++out_unregister_raw_proto: ++ proto_unregister(&raw_prot); ++out_unregister_udp_proto: ++ proto_unregister(&udp_prot); ++out_unregister_tcp_proto: ++ proto_unregister(&tcp_prot); ++ goto out; ++} ++ ++fs_initcall(inet_init); ++ ++/* ------------------------------------------------------------------------ */ ++ ++#ifdef CONFIG_PROC_FS ++static int __init ipv4_proc_init(void) ++{ ++ int rc = 0; ++ ++ if (raw_proc_init()) ++ goto out_raw; ++ if (tcp4_proc_init()) ++ goto out_tcp; ++ if (udp4_proc_init()) ++ goto out_udp; ++ if (ping_proc_init()) ++ goto out_ping; ++ if (ip_misc_proc_init()) ++ goto out_misc; ++out: ++ return rc; ++out_misc: ++ ping_proc_exit(); ++out_ping: ++ udp4_proc_exit(); ++out_udp: ++ tcp4_proc_exit(); ++out_tcp: ++ raw_proc_exit(); ++out_raw: ++ rc = -ENOMEM; ++ goto out; ++} ++ ++#else /* CONFIG_PROC_FS */ ++static int __init ipv4_proc_init(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -rupN linux.orig/net/ipv6/seg6_local.c linux/net/ipv6/seg6_local.c +--- linux.orig/net/ipv6/seg6_local.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/ipv6/seg6_local.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1508,13 +1508,13 @@ static int put_nla_counters(struct sk_bu pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { @@ -8869,11 +59406,10 @@ index b7de5e46fdd8f..f84da849819cc 100644 
counters.packets += packets; counters.bytes += bytes; -diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c -index 9d7b238a67372..965b9cb2ef3f2 100644 ---- a/net/mac80211/sta_info.c -+++ b/net/mac80211/sta_info.c -@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, +diff -rupN linux.orig/net/mac80211/sta_info.c linux/net/mac80211/sta_info.c +--- linux.orig/net/mac80211/sta_info.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mac80211/sta_info.c 2022-12-04 10:40:26.732034003 -0500 +@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu( u64 value; do { @@ -8885,7 +59421,7 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) +@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(st u64 value; do { @@ -8897,11 +59433,10 @@ index 9d7b238a67372..965b9cb2ef3f2 100644 return value; } -diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c -index b52afe316dc41..35b5f806fdda1 100644 ---- a/net/mpls/af_mpls.c -+++ b/net/mpls/af_mpls.c -@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, +diff -rupN linux.orig/net/mpls/af_mpls.c linux/net/mpls/af_mpls.c +--- linux.orig/net/mpls/af_mpls.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/mpls/af_mpls.c 2022-12-04 10:40:26.732034003 -0500 +@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_d p = per_cpu_ptr(mdev->stats, i); do { @@ -8913,11 +59448,10 @@ index b52afe316dc41..35b5f806fdda1 100644 stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; -diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c -index efab2b06d3732..5a7349002508e 100644 ---- a/net/netfilter/ipvs/ip_vs_ctl.c -+++ b/net/netfilter/ipvs/ip_vs_ctl.c -@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +diff -rupN linux.orig/net/netfilter/ipvs/ip_vs_ctl.c linux/net/netfilter/ipvs/ip_vs_ctl.c +--- linux.orig/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/ipvs/ip_vs_ctl.c 2022-12-04 10:40:26.736033993 -0500 +@@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struc u64 conns, inpkts, outpkts, inbytes, outbytes; do { @@ -8933,11 +59467,10 @@ index efab2b06d3732..5a7349002508e 100644 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, -diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c -index 63c70141b3e5d..cde0d9f0d838e 100644 ---- a/net/netfilter/nf_tables_api.c -+++ b/net/netfilter/nf_tables_api.c -@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) +diff -rupN linux.orig/net/netfilter/nf_tables_api.c linux/net/netfilter/nf_tables_api.c +--- linux.orig/net/netfilter/nf_tables_api.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/netfilter/nf_tables_api.c 2022-12-04 10:40:26.736033993 -0500 +@@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { @@ -8950,11 +59483,10 @@ index 63c70141b3e5d..cde0d9f0d838e 100644 total.pkts += pkts; total.bytes += bytes; } -diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c -index 93c596e3b22b9..b05458c170484 100644 ---- a/net/openvswitch/datapath.c -+++ b/net/openvswitch/datapath.c -@@ -715,9 +715,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, +diff -rupN 
linux.orig/net/openvswitch/datapath.c linux/net/openvswitch/datapath.c +--- linux.orig/net/openvswitch/datapath.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/datapath.c 2022-12-04 10:40:26.736033993 -0500 +@@ -715,9 +715,9 @@ static void get_dp_stats(const struct da percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { @@ -8966,11 +59498,10 @@ index 93c596e3b22b9..b05458c170484 100644 stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; -diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c -index d4a2db0b22998..0a0e4c283f02e 100644 ---- a/net/openvswitch/flow_table.c -+++ b/net/openvswitch/flow_table.c -@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) +diff -rupN linux.orig/net/openvswitch/flow_table.c linux/net/openvswitch/flow_table.c +--- linux.orig/net/openvswitch/flow_table.c 2022-12-02 11:43:18.000000000 -0500 ++++ linux/net/openvswitch/flow_table.c 2022-12-04 10:40:26.736033993 -0500 +@@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counter stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { @@ -8982,7 +59513,7 @@ index d4a2db0b22998..0a0e4c283f02e 100644 ma->masks_usage_zero_cntr[i] += counter; } -@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) +@@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flo stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do {